output:

md_document:

variant: markdown_github

setwd("~/HomeWork/Practice")
devtools::install_github("kassambara/datarium")
Skipping install of 'datarium' from a github remote, the SHA1 (f9f9b3a6) has not changed since last install.
  Use `force = TRUE` to force installation
data("marketing", package="datarium")
head(marketing)
data("swiss")
head(swiss)
data("Boston", package="MASS")
head(Boston)

Chapter 3 Regression

Loading Required R packages

# Load packages used throughout: tidyverse for data wrangling/plotting,
# caret for data splitting, model training and performance metrics
library(tidyverse)
library(caret)
# Use the black-and-white ggplot2 theme for every plot in this document
theme_set(theme_bw())

Preparing the data

# Load the data
data("marketing", package = "datarium")
# Inspect the data: show 3 randomly sampled rows
sample_n(marketing,3)

Split the data into training and test set

# Reproducible 80/20 train/test split of the marketing data,
# stratified on the outcome (sales) by createDataPartition().
set.seed(123)
training.samples <- createDataPartition(marketing$sales, p = 0.8, list = FALSE)

# Index into the data frame to form the two sets
train.data <- marketing[training.samples, ]
test.data <- marketing[-training.samples, ]
summary(train.data)
    youtube          facebook       newspaper          sales      
 Min.   :  0.84   Min.   : 0.00   Min.   :  0.36   Min.   : 1.92  
 1st Qu.: 85.95   1st Qu.:11.61   1st Qu.: 19.11   1st Qu.:12.48  
 Median :196.08   Median :25.44   Median : 35.16   Median :15.48  
 Mean   :179.24   Mean   :27.51   Mean   : 39.07   Mean   :16.77  
 3rd Qu.:264.54   3rd Qu.:43.89   3rd Qu.: 55.02   3rd Qu.:20.88  
 Max.   :355.68   Max.   :59.52   Max.   :136.80   Max.   :32.40  
dim(train.data)
[1] 162   4
dim(test.data)
[1] 38  4

Computing linear regression in r

# Fit a multiple linear regression of sales on all other predictors
# (youtube, facebook, newspaper) using the training set only
model <- lm(sales ~., data = train.data)
summary(model)

Call:
lm(formula = sales ~ ., data = train.data)

Residuals:
     Min       1Q   Median       3Q      Max 
-10.4122  -1.1101   0.3475   1.4218   3.4987 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 3.391883   0.440622   7.698 1.41e-12 ***
youtube     0.045574   0.001592  28.630  < 2e-16 ***
facebook    0.186941   0.009888  18.905  < 2e-16 ***
newspaper   0.001786   0.006773   0.264    0.792    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.119 on 158 degrees of freedom
Multiple R-squared:  0.8902,    Adjusted R-squared:  0.8881 
F-statistic: 426.9 on 3 and 158 DF,  p-value: < 2.2e-16
# Make predictions on the held-out test set
predictions <- model %>% predict(test.data)

# Model performance
# (a) Prediction error, RMSE (root mean squared error; lower is better)
RMSE(predictions, test.data$sales)
[1] 1.590096
# (b) R-square
R2(predictions, test.data$sales)
[1] 0.9380644

Non-Linear Regression

data("Boston", package = "MASS")
set.seed(123)

head(Boston)
training1.samples <- Boston$medv %>% createDataPartition(p=0.8, list = FALSE)

train1.data <- Boston[training.samples,]
test1.data <- Boston[-training.samples, ]

ggplot(train1.data, aes(lstat, medv)) +
  geom_point() +
  stat_smooth()

Let’s start with fitting a linear regression model

model.lm <- lm(medv~lstat, data = train1.data)
summary(model.lm)

Call:
lm(formula = medv ~ lstat, data = train1.data)

Residuals:
   Min     1Q Median     3Q    Max 
-7.889 -3.805 -1.656  2.059 19.972 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 33.98512    0.88151   38.55   <2e-16 ***
lstat       -0.88925    0.06524  -13.63   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 5.548 on 160 degrees of freedom
Multiple R-squared:  0.5373,    Adjusted R-squared:  0.5344 
F-statistic: 185.8 on 1 and 160 DF,  p-value: < 2.2e-16
# Score the linear fit on the held-out Boston test set and
# report its prediction error (RMSE) and R-squared.
prediction <- predict(model.lm, test1.data)

data.frame(
  RMSE = RMSE(prediction, test1.data$medv),
  R2 = R2(prediction, test1.data$medv)
)

Let’s visualize the data

ggplot(train1.data, aes(lstat, medv)) +
  geom_point() +
  stat_smooth(method = lm, formula = y ~ x)

Polynomial regression

medv = b0 + b1 ∗ lstat + b2 ∗ lstat2

mod2 <- lm(medv ~ lstat + I(lstat*lstat), data = train1.data)
summary(mod2)

Call:
lm(formula = medv ~ lstat + I(lstat * lstat), data = train1.data)

Residuals:
    Min      1Q  Median      3Q     Max 
-8.3393 -3.2427 -0.5934  2.0975 16.7311 

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)      43.432475   1.274260  34.084  < 2e-16 ***
lstat            -2.520720   0.189206 -13.323  < 2e-16 ***
I(lstat * lstat)  0.053205   0.005921   8.987 7.09e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 4.532 on 159 degrees of freedom
Multiple R-squared:  0.6932,    Adjusted R-squared:  0.6893 
F-statistic: 179.6 on 2 and 159 DF,  p-value: < 2.2e-16
# Test-set accuracy of the quadratic model fitted above
prediction2 <- predict(mod2, test1.data)
data.frame(
  RMSE = RMSE(prediction2, test1.data$medv),
  R2 = R2(prediction2, test1.data$medv)
)
# The same quadratic fit expressed with poly(); orthogonal polynomial
# terms give identical fitted values and residuals
mod22 <- lm(medv ~ poly(lstat, 2), data = train1.data)
summary(mod22)

Call:
lm(formula = medv ~ poly(lstat, 2), data = train1.data)

Residuals:
    Min      1Q  Median      3Q     Max 
-8.3393 -3.2427 -0.5934  2.0975 16.7311 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)      23.5407     0.3561  66.116  < 2e-16 ***
poly(lstat, 2)1 -75.6187     4.5318 -16.686  < 2e-16 ***
poly(lstat, 2)2  40.7253     4.5318   8.987 7.09e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 4.532 on 159 degrees of freedom
Multiple R-squared:  0.6932,    Adjusted R-squared:  0.6893 
F-statistic: 179.6 on 2 and 159 DF,  p-value: < 2.2e-16
prediction22 <- mod22 %>% predict(test1.data)
data.frame(
  RMSE = RMSE(prediction22, test1.data$medv),
  R2 = R2(prediction22, test1.data$medv)
)
ggplot(train1.data, aes(lstat, medv)) +
  geom_point() +
  stat_smooth(method = lm, formula = y ~ poly(x,2))

ggplot(train1.data, aes(lstat, medv)) +
  geom_point() +
  stat_smooth(method = lm, formula = y ~ (x + I(x*x)))

mod6 <- lm(medv ~ poly(lstat, 6), data = train1.data)
summary(mod6)

Call:
lm(formula = medv ~ poly(lstat, 6), data = train1.data)

Residuals:
     Min       1Q   Median       3Q      Max 
-12.4412  -2.3739  -0.7595   1.7344  15.9725 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)      23.5407     0.3304  71.245  < 2e-16 ***
poly(lstat, 6)1 -75.6187     4.2055 -17.981  < 2e-16 ***
poly(lstat, 6)2  40.7253     4.2055   9.684  < 2e-16 ***
poly(lstat, 6)3 -16.9822     4.2055  -4.038 8.45e-05 ***
poly(lstat, 6)4  12.1006     4.2055   2.877  0.00458 ** 
poly(lstat, 6)5  -9.1610     4.2055  -2.178  0.03089 *  
poly(lstat, 6)6  -2.2956     4.2055  -0.546  0.58595    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 4.206 on 155 degrees of freedom
Multiple R-squared:  0.7424,    Adjusted R-squared:  0.7324 
F-statistic: 74.45 on 6 and 155 DF,  p-value: < 2.2e-16

As we can see in above model summary the polynomial term beyond 5 is not significant

Let’s visualize model

ggplot(train1.data, aes(lstat, medv)) +
  geom_point() +
  stat_smooth(method = lm, formula = y ~ poly(x,6))

mod5 <- lm(medv ~ poly(lstat, 5), data = train1.data)
summary(mod5)

Call:
lm(formula = medv ~ poly(lstat, 5), data = train1.data)

Residuals:
     Min       1Q   Median       3Q      Max 
-13.0370  -2.4259  -0.7941   1.6414  16.1361 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)      23.5407     0.3297  71.406  < 2e-16 ***
poly(lstat, 5)1 -75.6187     4.1961 -18.021  < 2e-16 ***
poly(lstat, 5)2  40.7253     4.1961   9.706  < 2e-16 ***
poly(lstat, 5)3 -16.9822     4.1961  -4.047 8.14e-05 ***
poly(lstat, 5)4  12.1006     4.1961   2.884  0.00448 ** 
poly(lstat, 5)5  -9.1610     4.1961  -2.183  0.03051 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 4.196 on 156 degrees of freedom
Multiple R-squared:  0.7419,    Adjusted R-squared:  0.7336 
F-statistic: 89.69 on 5 and 156 DF,  p-value: < 2.2e-16
ggplot(train1.data, aes(lstat, medv)) +
  geom_point() +
  stat_smooth(method = lm, formula = y ~ poly(x,5))

Using log transformation

mod.log <- lm(medv ~ log(lstat), data = train1.data)
summary(mod.log)

Call:
lm(formula = medv ~ log(lstat), data = train1.data)

Residuals:
   Min     1Q Median     3Q    Max 
-7.610 -2.975 -1.048  1.886 17.093 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  50.2130     1.4077   35.67   <2e-16 ***
log(lstat)  -11.5918     0.5928  -19.55   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 4.43 on 160 degrees of freedom
Multiple R-squared:  0.705, Adjusted R-squared:  0.7032 
F-statistic: 382.4 on 1 and 160 DF,  p-value: < 2.2e-16
prediction.log <- mod.log %>% predict(test1.data)

data.frame(
  RMSE = RMSE(prediction.log, test1.data$medv),
  R2 = R2(prediction.log, test1.data$medv)
)
ggplot(data = train1.data, aes(lstat, medv))+
  geom_point() +
  stat_smooth(method = lm, formula = y~log(x))

Spline Regression

# Cubic spline regression with knots placed at the lstat quartiles
library(splines)
knots <- quantile(train1.data$lstat, p = c(0.25, 0.5, 0.75))
model.spline <- lm(medv ~ bs(lstat, knots = knots), data = train1.data)
summary(model.spline)

Call:
lm(formula = medv ~ bs(lstat, knots = knots), data = train1.data)

Residuals:
    Min      1Q  Median      3Q     Max 
-9.0823 -2.0512 -0.6894  1.8034 15.8245 

Coefficients:
                          Estimate Std. Error t value Pr(>|t|)    
(Intercept)                 44.032      2.643  16.658  < 2e-16 ***
bs(lstat, knots = knots)1   -1.127      4.160  -0.271    0.787    
bs(lstat, knots = knots)2  -22.744      2.831  -8.034 2.23e-13 ***
bs(lstat, knots = knots)3  -19.776      3.102  -6.376 1.98e-09 ***
bs(lstat, knots = knots)4  -33.442      3.570  -9.366  < 2e-16 ***
bs(lstat, knots = knots)5  -27.995      4.901  -5.713 5.53e-08 ***
bs(lstat, knots = knots)6  -29.583      4.500  -6.574 7.08e-10 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 4.097 on 155 degrees of freedom
Multiple R-squared:  0.7555,    Adjusted R-squared:  0.746 
F-statistic: 79.82 on 6 and 155 DF,  p-value: < 2.2e-16
prediction.spline <- model.spline %>% predict(test1.data)
some 'x' values beyond boundary knots may cause ill-conditioned bases
data.frame(
  RMSE = RMSE(prediction.spline, test1.data$medv),
  R2 = R2(prediction.spline, test1.data$medv)
)
ggplot(train1.data, aes(lstat, medv)) + 
  geom_point() + 
  stat_smooth(method = lm, formula = y~bs(x,df=3))

Generalized additive models

library(mgcv)
Loading required package: nlme

Attaching package: ‘nlme’

The following object is masked from ‘package:dplyr’:

    collapse

This is mgcv 1.8-27. For overview type 'help("mgcv-package")'.
model.additive <- gam(medv ~ s(lstat), data = train1.data)
summary(model.additive)

Family: gaussian 
Link function: identity 

Formula:
medv ~ s(lstat)

Parametric coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  23.5407     0.3261   72.18   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Approximate significance of smooth terms:
           edf Ref.df     F p-value    
s(lstat) 5.825   7.01 65.16  <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

R-sq.(adj) =  0.739   Deviance explained = 74.9%
GCV =  17.99  Scale est. = 17.232    n = 162
prediction.additive <- model.additive %>% predict(test1.data)

data.frame(
  RMSE = RMSE(prediction.additive, test1.data$medv),
  R2 = R2(prediction.additive, test1.data$medv)
)
ggplot(train1.data, aes(lstat, medv)) +
  geom_point() +
  stat_smooth(method = gam, formula = y ~ s(x))

Regression Model Accuracy Metrics

Model performance metrics

In regression model, the most commonly known evaluation metrics include:

  1. R-squared (R2), which is the proportion of variation in the outcome that is explained by the predictor variables. In multiple regression models, R2 corresponds to the squared correlation between the observed outcome values and the predicted values by the model. The higher the R-squared, the better the model.

  2. Root Mean Squared Error (RMSE), which measures the average error performed by the model in the predicting the outcome for an observation. Mathematically, the RMSE is the square root of the mean squared error (MSE), which is the average squared difference between the observed actual outcome values and the values predicted by the model. So, MSE = mean((observeds - predicteds)^2) and RMSE = sqrt(MSE). The lower the RMSE, the better the model.

  3. Residual Standard Error (RSE), also known as the model sigma, is a variant of the RMSE adjusted for the number of predictors in the model. The lower the RSE, the better the model. In practice, the difference between RMSE and RSE is very small, particularly for large multivariate data.

  4. Mean Absolute Error (MAE), like the RMSE, the MAE measures the prediction error. Mathematically, it is the average absolute difference between observed and predicted outcomes, MAE = mean(abs(observeds - predicteds)). MAE is less sensitive to outliers compared to RMSE.

The problem with the above metrics, is that they are sensible to the inclusion of additional variables in the model, even if those variables don’t have significant contribution in explaining the outcome. Put in other words, including additional variables in the model will always increase the R2 and reduce the RMSE. So, we need a more robust metric to guide the model choice.

Concerning R2, there is an adjusted version, called Adjusted R-squared, which adjusts the R2 for having too many variables in the model.

Additionally, there are four other important metrics - AIC, AICc, BIC and Mallows Cp - that are commonly used for model evaluation and selection. These are an unbiased estimate of the model prediction error MSE. The lower these metrics, the better the model.

  1. AIC stands for Akaike's Information Criterion, a metric developed by the Japanese statistician Hirotugu Akaike in 1970. The basic idea of AIC is to penalize the inclusion of additional variables to a model. It adds a penalty that increases the error when including additional terms. The lower the AIC, the better the model.

  2. AICc is a version of AIC corrected for small sample sizes.

  3. BIC (or Bayesian information criteria) is a variant of AIC with a strong penalty for including additional variables to the model.

  4. Mallows Cp: A variant of AIC developed by Colin Mallows.

Generally, most commonly used metrics, for measuring regression model quality and comparing models, are: Adjusted R2, AIC, BIC and Cp

In the following sections, we’ll show you how to compute these above-mentioned metrics.

library(modelr)
library(broom)

Attaching package: ‘broom’

The following object is masked from ‘package:modelr’:

    bootstrap
# Load the data
data("swiss")

# Inspect the data
sample_n(swiss,3)

Building regression models

We start by creating two models:

  1. Model 1, including all predictors
  2. Model 2, including all predictors except the variable Examination
model.swiss.1 <- lm(Fertility~., data = swiss)
summary(model.swiss.1)

Call:
lm(formula = Fertility ~ ., data = swiss)

Residuals:
     Min       1Q   Median       3Q      Max 
-15.2743  -5.2617   0.5032   4.1198  15.3213 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)      66.91518   10.70604   6.250 1.91e-07 ***
Agriculture      -0.17211    0.07030  -2.448  0.01873 *  
Examination      -0.25801    0.25388  -1.016  0.31546    
Education        -0.87094    0.18303  -4.758 2.43e-05 ***
Catholic          0.10412    0.03526   2.953  0.00519 ** 
Infant.Mortality  1.07705    0.38172   2.822  0.00734 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 7.165 on 41 degrees of freedom
Multiple R-squared:  0.7067,    Adjusted R-squared:  0.671 
F-statistic: 19.76 on 5 and 41 DF,  p-value: 5.594e-10

data.frame(
  R2 = rsquare(model.swiss.1, data = swiss),
  RMSE = rmse(model.swiss.1, data = swiss),
  MAE = mae(model.swiss.1, data = swiss),
  AIC = AIC(model.swiss.1),
  BIC = BIC(model.swiss.1)
)
model.swiss.2 <- lm(Fertility~. - Examination, data = swiss)

summary(model.swiss.2)

Call:
lm(formula = Fertility ~ . - Examination, data = swiss)

Residuals:
     Min       1Q   Median       3Q      Max 
-14.6765  -6.0522   0.7514   3.1664  16.1422 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)      62.10131    9.60489   6.466 8.49e-08 ***
Agriculture      -0.15462    0.06819  -2.267  0.02857 *  
Education        -0.98026    0.14814  -6.617 5.14e-08 ***
Catholic          0.12467    0.02889   4.315 9.50e-05 ***
Infant.Mortality  1.07844    0.38187   2.824  0.00722 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 7.168 on 42 degrees of freedom
Multiple R-squared:  0.6993,    Adjusted R-squared:  0.6707 
F-statistic: 24.42 on 4 and 42 DF,  p-value: 1.717e-10
data.frame(
  R2 = rsquare(model.swiss.2, data = swiss),
  RMSE = rmse(model.swiss.2, data = swiss),
  MAE = mae(model.swiss.2, data = swiss),
  AIC = AIC(model.swiss.2),
  BIC = BIC(model.swiss.2)
)
glance(model.swiss.1)
glance(model.swiss.2)

Manual computation of R2, RMSE and MAE

# Manual computation of R2, MSE, RMSE and MAE for model 1.
# BUG FIX: MSE is the mean of the squared residuals, mean((obs - pred)^2),
# not the square of the mean residual, mean(obs - pred)^2 — the latter is
# near zero for any least-squares fit because residuals average to ~0.
swiss %>% add_predictions(model.swiss.1) %>% summarise(
  R2 = cor(Fertility, pred)^2,
  MSE = mean((Fertility - pred)^2),
  RMSE = sqrt(MSE),
  MAE = mean(abs(Fertility - pred))
)
  • Comparing regression models performance
glance(model.swiss.1) %>% select(adj.r.squared, sigma, AIC, BIC, p.value)
glance(model.swiss.2) %>% select(adj.r.squared, sigma, AIC, BIC, p.value)

From the output above, it can be seen that:

  1. The two models have exactly the same Adjusted R2 (0.67), meaning that they are equivalent in explaining the outcome, here fertility score. Additionally, they have the same amount of residual standard error (RSE or sigma = 7.17). However, model 2 is simpler than model 1 because it incorporates fewer variables. All things equal, the simpler model is always better in statistics.

  2. The AIC and the BIC of the model 2 are lower than those of the model1. In model comparison strategies, the model with the lowest AIC and BIC score is preferred.

  3. Finally, the F-statistic p.value of the model 2 is lower than the one of the model 1. This means that the model 2 is statistically more significant compared to model 1, which is consistent with the above conclusion.

Note that, the RMSE and the RSE are measured in the same scale as the outcome variable. Dividing the RSE by the average value of the outcome variable will give you the prediction error rate, which should be as small as possible:

sigma(model.swiss.1)/mean(swiss$Fertility)
[1] 0.1021544

In our example the average prediction error rate is 10%

Cross-validation

Cross-validation refers to a set of methods for measuring the performance of a given predictive model on new test data sets.

The basic idea, behind cross-validation techniques, consists of dividing the data into two sets:

  1. The training set, used to train (i.e. build) the model;
  2. and the testing set (or validation set), used to test (i.e. validate) the model by estimating the prediction error.

Cross-validation is also known as a resampling method because it involves fitting the same statistical method multiple times using different subsets of the data.

  1. The different cross-validation methods for assessing model performance. We cover the following approaches:

    • validation set approach (or data split)
    • Leave One Out Cross Validation
    • k-fold Cross Validation
    • Repeated k-fold Cross Validation

Model performance metrics

After building a model, we are interested in determining the accuracy of this model on predicting the outcome for new unseen observations not used to build the model. Put in other words, we want to estimate the prediction error.

To do so, the basic strategy is to:

  1. Build the model on a training data set
  2. Apply the model on a new test data set to make predictions
  3. Compute the prediction errors

Cross-validation methods

Briefly, cross-validation algorithms can be summarized as follow:

  1. Reserve a small sample of the data set
  2. Build (or train) the model using the remaining part of the data set
  3. Test the effectiveness of the model on the reserved sample of the data set. If the model works well on the test data set, then it’s good.

The following sections describe the different cross-validation techniques.

1. The Validation set Approach

The validation set approach consists of randomly splitting the data into two sets: one set is used to train the model and the remaining other set is used to test the model.

The process works as follow:

  1. Build (train) the model on the training data set
  2. Apply the model to the test data set to predict the outcome of new unseen observations
  3. Quantify the prediction error as the mean squared difference between the observed and the predicted outcome values.

The example below splits the swiss data set so that 80% is used for training a linear regression model and 20% is used to evaluate the model performance.

# Validation-set approach: reproducible 80/20 split of the swiss data,
# stratified on the outcome (Fertility)
set.seed(123)

training.samples <- createDataPartition(swiss$Fertility, p = 0.8, list = FALSE)

train.swiss.data <- swiss[training.samples, ]
test.swiss.data <- swiss[-training.samples, ]

# Fit a linear model on the training portion only
model.swiss.lm <- lm(Fertility ~ ., data = train.swiss.data)

# Score the held-out 20% and report R2, RMSE and MAE
predictions.swiss.lm <- predict(model.swiss.lm, test.swiss.data)

data.frame(
  R2 = R2(predictions.swiss.lm, test.swiss.data$Fertility),
  RMSE = RMSE(predictions.swiss.lm, test.swiss.data$Fertility),
  MAE = MAE(predictions.swiss.lm, test.swiss.data$Fertility)
)

When comparing two models, the one that produces the lowest test sample RMSE is the preferred model.

The RMSE and the MAE are measured in the same scale as the outcome variable. Dividing the RMSE by the average value of the outcome variable will give you the prediction error rate, which should be as small as possible:

RMSE(predictions.swiss.lm, test.swiss.data$Fertility)/mean(test.swiss.data$Fertility)
[1] 0.1281514

Note that, the validation set method is only useful when you have a large data set that can be partitioned. A disadvantage is that we build a model on a fraction of the data set only, possibly leaving out some interesting information about data, leading to higher bias. Therefore, the test error rate can be highly variable, depending on which observations are included in the training set and which observation are included in the validation set.

2. Leave one out cross validation - LOOCV

This method works as follows: 1. Leave out one data point and build the model on the rest of the data set 2. Test the model against the data point that is left out at step 1 and record the test error associated with the prediction 3. Repeat the process for all data points 4. Compute the overall prediction error by taking the average of all these test error estimates recorded at step 3.

Practical example in R using the caret package:

# Define training control: leave-one-out cross-validation — each of the
# 47 observations serves once as a single-row test set
train.control <- trainControl(method = "LOOCV")

# Train a linear model under LOOCV via caret

model.loocv <- train(Fertility ~., data = swiss, method = "lm", trControl = train.control)
# Summarize the resampled RMSE, R-squared and MAE
print(model.loocv)
Linear Regression 

47 samples
 5 predictor

No pre-processing
Resampling: Leave-One-Out Cross-Validation 
Summary of sample sizes: 46, 46, 46, 46, 46, 46, ... 
Resampling results:

  RMSE      Rsquared   MAE     
  7.738618  0.6128307  6.116021

Tuning parameter 'intercept' was held constant at a value of TRUE

The advantage of the LOOCV method is that we make use all data points reducing potential bias.

However, the process is repeated as many times as there are data points, resulting in a higher execution time when n is extremely large.

Additionally, we test the model performance against one data point at each iteration. This might result in higher variation in the prediction error, if some data points are outliers. So, we need a good ratio of testing data points, a solution provided by the k-fold cross-validation method.

K-fold cross-validation

The k-fold cross validation method evaluates the model performance on different subset of the training data and then calculate the average prediction error rate. The algorithm is as follow:

  1. Randomly split the data into k subsets (or k folds) (for example 5 subsets)
  2. Reserve one subset and train the model on all other subsets
  3. Test the model on the reserved subset and record the prediction error
  4. Repeat this process until each of the k subsets has served as the test set.
  5. Compute the average of the k recorded errors. This is called the cross validation error serving as the performance metric for the model

K-fold cross-validation (CV) is a robust method for estimating the accuracy of a model. The most obvious advantage of k-fold CV compared to LOOCV is computational. A less obvious but potentially more important advantage of k-fold CV is that it often gives more accurate estimates of the test error rate than does LOOCV

In practice, one typically performs k-fold cross-validation using k = 5 or k = 10, as these values have been shown empirically to yield test error rate estimates that suffer neither from excessively high bias nor from very high variance.

The following example uses 10-fold cross validation to estimate the prediction error. Make sure to set seed for reproducibility.

# Define training control: 10-fold cross-validation.
# Seed first so the fold assignment is reproducible.
set.seed(123)

train.control <- trainControl(method = "cv", number = 10)

# Train a linear model, averaging the prediction error over the 10 folds
model.cv <- train(Fertility~., data = swiss, method = "lm", trControl = train.control)

# Summarize the resampled RMSE, R-squared and MAE
print(model.cv)
Linear Regression 

47 samples
 5 predictor

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 43, 42, 42, 41, 43, 41, ... 
Resampling results:

  RMSE      Rsquared   MAE     
  7.379432  0.7512317  6.032731

Tuning parameter 'intercept' was held constant at a value of TRUE

4. Repeated K-fold cross-validation

The process of splitting the data into k-folds can be repeated a number of times, this is called repeated k-fold cross validation.

Bootstrap procedure

The bootstrap method is used to quantify the uncertainty with a given statistical estimator or with a predictive model.

It consists of randomly selecting a sample of n observations from the original data set. This subset, called bootstrap data set is then used to evaluate the model.

This procedure is repeated a large number of times and the standard error of the bootstrap estimate is then calculated. The results provide an indication of the variance of the model performance.

Note that, the sampling is performed with replacement, which means that the same observation can occur more than once in the bootstrap data set.

Evaluating a predictive model performance

# Define training control: bootstrap resampling with 100 replicates
# (each replicate samples n observations with replacement)

train.control.bootstrap <- trainControl(method = "boot", number = 100)

# Train the linear model; performance is averaged over the 100 bootstrap samples
model.bootstrap <- train(Fertility ~., data = swiss, method = "lm", trControl = train.control.bootstrap)

# Summarize the resampled RMSE, R-squared and MAE
print(model.bootstrap)
Linear Regression 

47 samples
 5 predictor

No pre-processing
Resampling: Bootstrapped (100 reps) 
Summary of sample sizes: 47, 47, 47, 47, 47, 47, ... 
Resampling results:

  RMSE      Rsquared   MAE     
  8.432357  0.6048287  6.791066

Tuning parameter 'intercept' was held constant at a value of TRUE

Quantifying an estimator uncertainty and confidence intervals

The bootstrap approach can be used to quantify the uncertainty (or standard error) associated with any given statistical estimator.

For example, you might want to estimate the accuracy of the linear regression beta coefficients using bootstrap method.

The different steps are as follows:

  1. Create a simple function, model_coef(), that takes the swiss data set as well as the indices for the observations, and returns the regression coefficients.
  2. Apply the function model_coef(), via the boot() function, to the full data set of 47 observations in order to compute the coefficients

We start by creating a function that returns the regression model coefficients:

# Return the coefficients of a Fertility-on-all-predictors linear model
# fitted to the rows of `data` selected by `index`.
# The (data, index) signature is the statistic interface expected by boot().
model_coef <- function(data, index) {
  fit <- lm(Fertility ~ ., data = data, subset = index)
  coef(fit)
}

# Coefficients computed on the full data set (all 47 provinces)
model_coef(swiss, 1:47)
     (Intercept)      Agriculture      Examination        Education         Catholic Infant.Mortality 
      66.9151817       -0.1721140       -0.2580082       -0.8709401        0.1041153        1.0770481 
library(boot)

Attaching package: ‘boot’

The following object is masked from ‘package:lattice’:

    melanoma
boot(swiss, model_coef, 500)

ORDINARY NONPARAMETRIC BOOTSTRAP


Call:
boot(data = swiss, statistic = model_coef, R = 500)


Bootstrap Statistics :
      original       bias    std. error
t1* 66.9151817  0.572138874 10.84264891
t2* -0.1721140 -0.004654338  0.06456320
t3* -0.2580082 -0.027889510  0.25405701
t4* -0.8709401  0.002319556  0.21736743
t5*  0.1041153 -0.002166432  0.03067023
t6*  1.0770481  0.008581232  0.43205390

In the output above,

  • original column corresponds to the regression coefficients. The associated standard errors are given in the column std.error .

  • t1 corresponds to the intercept, t2 corresponds to Agriculture, and so on.

For example, it can be observed that the standard error (SE) of the regression coefficient associated with Agriculture is 0.06

Note that, the standard errors measure the variability/accuracy of the beta coefficients. It can be used to compute the confidence intervals of the coefficients.

For example, the 95% confidence interval for a given coefficient b is defined as b+/-1.96*SE(b), where:

  • The lower limits of b = b-1.96*SE(b) = -0.172 - (1.96*0.0680) = -0.306(for Agriculture variable)
  • The Upper limits of b = b+1.96*SE(b) = -0.172 + (1.96*0.0680) = -0.03743121(for Agriculture variable)

That is, there is approximately a 95% chance that the interval[-0.306,-0.036] will contain the true value of the coefficient.

Using the standard lm() function gives a slightly different standard errors, because the linear model make some assumptions about the data:

summary(lm(Fertility ~., data = swiss))$coef
                   Estimate  Std. Error   t value     Pr(>|t|)
(Intercept)      66.9151817 10.70603759  6.250229 1.906051e-07
Agriculture      -0.1721140  0.07030392 -2.448142 1.872715e-02
Examination      -0.2580082  0.25387820 -1.016268 3.154617e-01
Education        -0.8709401  0.18302860 -4.758492 2.430605e-05
Catholic          0.1041153  0.03525785  2.952969 5.190079e-03
Infant.Mortality  1.0770481  0.38171965  2.821568 7.335715e-03

The bootstrap approach does not rely on any of these assumptions made by the linear model and so it is likely giving a more accurate estimate of the coefficients’ standard errors than is the summary() function.

Model Selection

Best Subsets Regression

library(tidyverse)
library(caret)
library(leaps)

Computing best subset regression

The R function regsubsets() [leaps package] can be used to identify different best models of different sizes.

You need to specify the option nvmax, which represents the maximum number of predictors to incorporate in the model. For example, if nvmax = 5, the function will return up to the best 5-variables model, that is, it returns the best 1-variable model, the best 2-variables model, .., the best 5-variables models.

In our example, we have only 5 predictor variables in the data. So, we’ll use nvmax = 5

models.regsubsets <- regsubsets(Fertility~., data= swiss, nvmax = 5)
summary(models.regsubsets)
Subset selection object
Call: regsubsets.formula(Fertility ~ ., data = swiss, nvmax = 5)
5 Variables  (and intercept)
                 Forced in Forced out
Agriculture          FALSE      FALSE
Examination          FALSE      FALSE
Education            FALSE      FALSE
Catholic             FALSE      FALSE
Infant.Mortality     FALSE      FALSE
1 subsets of each size up to 5
Selection Algorithm: exhaustive
         Agriculture Examination Education Catholic Infant.Mortality
1  ( 1 ) " "         " "         "*"       " "      " "             
2  ( 1 ) " "         " "         "*"       "*"      " "             
3  ( 1 ) " "         " "         "*"       "*"      "*"             
4  ( 1 ) "*"         " "         "*"       "*"      "*"             
5  ( 1 ) "*"         "*"         "*"       "*"      "*"             

Model selection criteria: Adjusted R2, Cp and BIC

The summary() function returns some metrics - Adjusted R2, Cp and BIC allowing us to identify the best overall model, where best is defined as the model that maximize the adjusted R2 and minimize the prediction errors

The adjusted R2 represents the proportion of variation, in the outcome, that are explained by variation in predictors values. The higher the adjusted R2, the better the model.

The best model, according to each of these metrics, can be extracted as follow:

# Model size (number of predictors) preferred by each selection criterion:
# maximize adjusted R2; minimize Mallows Cp and BIC
res.sum <- summary(models.regsubsets)

data.frame(
  Adj.R2 = which.max(res.sum$adjr2),
  CP = which.min(res.sum$cp),
  BIC = which.min(res.sum$bic)
)
NA

There is no single correct solution to model selection, each of these criteria will lead to slightly different models.

Remember that,

“all models are wrong, some models are useful”

So, we have different “best” models depending on which metrics we consider. We need additional strategies.

Note also that the adjusted R2, BIC and Cp are calculated on the training data that have been used to fit the model. This means that, the model selection, using these metrics, is possibly subject to overfitting and may not perform as well when applied to new data.

A more rigorous approach is to select a model based on the prediction error computed on a new test data set using k-fold cross-validation techniques

K-fold cross-validation

Here, we’ll follow the procedure below:

  1. Extract the different model formulas from the models object
  2. Train a linear model on the formula using k-fold cross-validation(with k=5) and compute the prediction error of each model

We start by defining two helper function:

  1. get_model_formula(), allowing to access easily the formula of the models returned by the function regsubsets().
#id: model id
#object: regsubsets object (the outcome and predictors are recovered from object$call)

# Build the model formula for the id-th model found by regsubsets().
#
# id:     model index in the regsubsets search (1 = best 1-variable model, etc.)
# object: a regsubsets object, as returned by leaps::regsubsets()
#
# Returns a formula of the form: outcome ~ pred1 + pred2 + ...
get_model_formula <- function(id, object){
  # Logical selection row for model `id`; drop the intercept column
  models <- summary(object)$which[id,-1]
  # Get outcome variable
  # NOTE(review): assumes the formula is the first argument of the call
  # stored in object$call — confirm for non-standard regsubsets() calls
  form <- as.formula(object$call[[2]])
  outcome <- all.vars(form)[1]
  # Get model predictors: names of the variables selected in this model
  # (which() on a logical vector — no need for the redundant `== TRUE`)
  predictors <- names(which(models))
  predictors <- paste(predictors, collapse = "+")
  # Build model formula
  as.formula(paste(outcome, "~", predictors))
}
get_model_formula(3,models.regsubsets)
Fertility ~ Education + Catholic + Infant.Mortality
<environment: 0x7f9c38bcc868>
  1. get_cv_error(), to get the cross-validation (CV) error for a given model:
# Compute the 5-fold cross-validated RMSE of a linear model.
#
# model.formula: a model formula (e.g. as built by get_model_formula())
# data:          the data frame used to fit and cross-validate the model
#
# Returns the cross-validation RMSE reported by caret::train().
get_cv_error <- function (model.formula, data){
  # Fix the seed so fold assignment is reproducible across calls
  set.seed(1)
  cv.settings <- trainControl(method = "cv", number = 5)
  fitted <- train(model.formula, data = data, method = "lm",
                  trControl = cv.settings)
  fitted$results$RMSE
}

Use the above defined method to compute the prediction error

# Compute cross-validation error

model.ids <- 1:5
cv.errors <- map(model.ids, get_model_formula, models.regsubsets) %>%
  map(get_cv_error, data = swiss) %>%
  unlist()
cv.errors
[1] 9.422610 8.452344 7.926889 7.678508 7.923860
# Select the model that minimize the CV error
which.min(cv.errors)
[1] 4

It can be seen that the model with 4 variables is the best model. It has the lowest prediction error. The regression coefficients of this model can be extracted as follow:

coef(models.regsubsets, 4)
     (Intercept)      Agriculture        Education         Catholic Infant.Mortality 
      62.1013116       -0.1546175       -0.9802638        0.1246664        1.0784422 

Stepwise Regression

  1. Forward selection: which starts with no predictors in the model, iteratively adds the most contributive predictor, and stops when the improvement is no longer statistically significant.

  2. Backward selection (or backward elimination): which starts with all predictors in the model (full model), iteratively removes the least contributive predictors, and stops when you have a model where all predictors are statistically significant.

  3. Stepwise selection (or sequential replacement) which is a combination of forward and backward selections. You start with no predictors, then sequentially add the most contributive predictors (like forward selection). After adding each new variable, remove any variables that no longer provide an improvement in the model fit

library(tidyverse)
library(caret)
library(leaps)

Computing stepwise regression

library(MASS)

Attaching package: ‘MASS’

The following object is masked from ‘package:dplyr’:

    select
# Fit the full model
full.model <- lm(Fertility ~., data = swiss)

# Stepwise regression model
step.model <- stepAIC(full.model, direction = "both", trace = FALSE)

summary(step.model)

Call:
lm(formula = Fertility ~ Agriculture + Education + Catholic + 
    Infant.Mortality, data = swiss)

Residuals:
     Min       1Q   Median       3Q      Max 
-14.6765  -6.0522   0.7514   3.1664  16.1422 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)      62.10131    9.60489   6.466 8.49e-08 ***
Agriculture      -0.15462    0.06819  -2.267  0.02857 *  
Education        -0.98026    0.14814  -6.617 5.14e-08 ***
Catholic          0.12467    0.02889   4.315 9.50e-05 ***
Infant.Mortality  1.07844    0.38187   2.824  0.00722 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 7.168 on 42 degrees of freedom
Multiple R-squared:  0.6993,    Adjusted R-squared:  0.6707 
F-statistic: 24.42 on 4 and 42 DF,  p-value: 1.717e-10

Penalized Regression: Ridge, Lasso and Elastic Net

Shrinkage methods

Ridge regression

Ridge regression shrinks the regression coefficients, so that variables, with minor contribution to the outcome, have their coefficients close to zero.

The shrinkage of the coefficients is achieved by penalizing the regression model with a penalty term called L2-norm, which is the sum of the squared coefficients.

The amount of the penalty can be fine-tuned using a constant called lambda . Selecting a good value for lambda is critical

When lambda = 0, the penalty term has no effect, and ridge regression will produce the classical least square coefficients. However, as lambda increases to infinity, the impact of the shrinkage penalty grows, and the ridge regression coefficients will get close to zero.

Note that, in contrast to the ordinary least square regression, ridge regression is highly affected by the scale of the predictors. Therefore, it is better to standardize (i.e., scale) the predictors before applying the ridge regression, so that all the predictors are on the same scale.

The standardization of a predictor x can be achieved using the formula x' = x/sd(x), where sd(x) is the standard deviation of x. The consequence of this is that all standardized predictors will have a standard deviation of one, allowing the final fit to not depend on the scale on which the predictors are measured.

One important advantage of the ridge regression, is that it still performs well, compared to the ordinary least square method, in a situation where you have a large multivariate data with the number of predictors (p) larger than the number of observations (n).

One disadvantage of the ridge regression is that, it will include all the predictors in the final model, unlike the stepwise regression methods, which will generally select models that involve a reduced set of variables.

Ridge regression shrinks the coefficients towards zero, but it will not set any of them exactly to zero. The lasso regression is an alternative that overcomes this drawback.

Lasso regression

Lasso stands for Least Absolute Shrinkage and Selection Operator. It shrinks the regression coefficients toward zero by penalizing the regression model with a penalty term called L1-norm, which is the sum of the absolute coefficients.

In the case of lasso regression, the penalty has the effect of forcing some of the coefficient estimates, with a minor contribution to the model, to be exactly equal to zero. This means that, lasso can be also seen as an alternative to the subset selection methods for performing variable selection in order to reduce the complexity of the model.

As in ridge regression, selecting a good value of lambda for the lasso is critical.

One obvious advantage of lasso regression over ridge regression, is that it produces simpler and more interpretable models that incorporate only a reduced set of the predictors. However, neither ridge regression nor the lasso will universally dominate the other.

Generally, lasso might perform better in a situation where some of the predictors have large coefficients, and the remaining predictors have very small coefficients.

Ridge regression will perform better when the outcome is a function of many predictors, all with coefficients of roughly equal size.

Cross-validation methods can be used for identifying which of these two techniques is better on a particular data set.

Elastic Net

Elastic Net produces a regression model that is penalized with both the L1-norm and L2-norm. The consequence of this is to effectively shrink coefficients (like in ridge regression) and to set some coefficients to zero (as in LASSO).

Loading required R packages

library(tidyverse)
library(caret)
library(glmnet)
Loading required package: Matrix

Attaching package: ‘Matrix’

The following objects are masked from ‘package:tidyr’:

    expand, pack, unpack

Loading required package: foreach

Attaching package: ‘foreach’

The following objects are masked from ‘package:purrr’:

    accumulate, when

Loaded glmnet 2.0-18

Preparing the data

We’ll use the Boston data set [in MASS package], for predicting the median house value (medv), in Boston Suburbs, based on multiple predictor variables.

# Load the data
data("Boston", package = "MASS")

# Split the data into training and test set
set.seed(123)
boston.samples <- Boston$medv %>%
  createDataPartition(p=0.8, list = FALSE)

train.boston.data <- Boston[boston.samples, ]
test.boston.data <- Boston[-boston.samples, ]

Computing penalized linear regression

You need to create two objects:

  • y for storing the outcome variable
  • x for holding the predictor variables.
x <- model.matrix(medv~., train.boston.data)[,-1]
# Outcome variable
y <- train.boston.data$medv

We’ll use the R function glmnet() for computing penalized linear regression models.

glmnet(x,y, alpha = 1, lambda = NULL)

Call:  glmnet(x = x, y = y, alpha = 1, lambda = NULL) 

      Df    %Dev   Lambda
 [1,]  0 0.00000 6.908000
 [2,]  1 0.09343 6.294000
 [3,]  2 0.18670 5.735000
 [4,]  2 0.26620 5.225000
 [5,]  2 0.33220 4.761000
 [6,]  2 0.38700 4.338000
 [7,]  2 0.43240 3.953000
 [8,]  2 0.47020 3.602000
 [9,]  2 0.50150 3.282000
[10,]  3 0.53390 2.990000
[11,]  3 0.56100 2.725000
[12,]  3 0.58350 2.482000
[13,]  3 0.60210 2.262000
[14,]  3 0.61770 2.061000
[15,]  3 0.63050 1.878000
[16,]  3 0.64120 1.711000
[17,]  3 0.65010 1.559000
[18,]  3 0.65740 1.421000
[19,]  3 0.66360 1.294000
[20,]  4 0.66990 1.179000
[21,]  4 0.67560 1.075000
[22,]  4 0.68030 0.979100
[23,]  4 0.68420 0.892200
[24,]  4 0.68750 0.812900
[25,]  5 0.69080 0.740700
[26,]  5 0.69380 0.674900
[27,]  6 0.69730 0.614900
[28,]  7 0.70200 0.560300
[29,]  7 0.70610 0.510500
[30,]  8 0.70960 0.465200
[31,]  8 0.71430 0.423800
[32,]  8 0.71820 0.386200
[33,]  8 0.72140 0.351900
[34,] 10 0.72520 0.320600
[35,] 11 0.72850 0.292100
[36,] 11 0.73120 0.266200
[37,] 11 0.73350 0.242500
[38,] 11 0.73540 0.221000
[39,] 11 0.73690 0.201400
[40,] 11 0.73820 0.183500
[41,] 12 0.73950 0.167200
[42,] 12 0.74220 0.152300
[43,] 12 0.74440 0.138800
[44,] 12 0.74620 0.126500
[45,] 12 0.74760 0.115200
[46,] 12 0.74880 0.105000
[47,] 12 0.74990 0.095660
[48,] 12 0.75070 0.087160
[49,] 12 0.75140 0.079420
[50,] 12 0.75200 0.072370
[51,] 12 0.75250 0.065940
[52,] 12 0.75290 0.060080
[53,] 12 0.75320 0.054740
[54,] 12 0.75350 0.049880
[55,] 12 0.75370 0.045450
[56,] 12 0.75390 0.041410
[57,] 12 0.75410 0.037730
[58,] 12 0.75420 0.034380
[59,] 12 0.75430 0.031330
[60,] 12 0.75440 0.028540
[61,] 12 0.75450 0.026010
[62,] 12 0.75460 0.023700
[63,] 12 0.75460 0.021590
[64,] 12 0.75460 0.019670
[65,] 12 0.75470 0.017930
[66,] 12 0.75470 0.016330
[67,] 12 0.75470 0.014880
[68,] 12 0.75480 0.013560
[69,] 12 0.75480 0.012360
[70,] 12 0.75480 0.011260
[71,] 12 0.75480 0.010260
[72,] 12 0.75480 0.009346
[73,] 12 0.75480 0.008516
[74,] 12 0.75480 0.007760
  • x: matrix of predictor variables
  • y: the response or outcome variable (here a continuous variable, medv).
  • alpha: the elasticnet mixing parameter. Allowed values include:
    • “1”: for lasso regression
    • “0”: for ridge regression
    • a valued between 0 and 1 (say 0.25) for elastic net regression.
    • lambda: a numeric value defining the amount of shrinkage. Should be specified by the analyst

In penalized regression, you need to specify a constant lambda to adjust the amount of the coefficient shrinkage. The best lambda for your data can be defined as the lambda that minimizes the cross-validation prediction error rate. This can be determined automatically using the function cv.glmnet().

In the following section, we start by computing ridge, lasso and elastic net regression models. Next, we’ll compare the different models in order to choose the best one for our data. The best model is defined as the model that has the lowest prediction error, RMSE.

Computing ridge regression

# Find the best lambda using cross-validation
set.seed(123)
cv <- cv.glmnet(x,y, alpha = 0)

# Display the best lambda value
cv$lambda.min
[1] 0.6907672
# Fit the final model on the training data
model.ridge <- glmnet(x,y, alpha = 0, lambda = cv$lambda.min)

# Display regression coefficients
coef(model.ridge)
14 x 1 sparse Matrix of class "dgCMatrix"
                       s0
(Intercept)  29.172942853
crim         -0.073961766
zn            0.035006472
indus        -0.055702961
chas          2.485565613
nox         -11.449222575
rm            3.976849313
age          -0.002944308
dis          -1.221350102
rad           0.147920742
tax          -0.006358355
ptratio      -0.869860292
black         0.009399874
lstat        -0.483051940
# Make predictions on the test data
x.test <- model.matrix(medv~., test.boston.data)[,-1]
predictions.ridge <- model.ridge %>% predict(x.test) %>% as.vector()

# Model performance metrics

data.frame(
  RMSE = RMSE(predictions.ridge, test.boston.data$medv),
  Rsquare = R2(predictions.ridge, test.boston.data$medv)
)

Note that by default, the function glmnet() standardizes variables so that their scales are comparable. However, the coefficients are always returned on the original scale.

Computing lasso regression

The only difference between the R code used for ridge regression is that for lasso regression you need to specify the argument alpha = 1 instead of alpha = 0.

# Find the best lambda using cross-validation
set.seed(123)
cv <- cv.glmnet(x,y,alpha = 1)
# Display the best lambda value
cv$lambda.min
[1] 0.007759554
# Fit the final model on the training data
model <- glmnet(x,y, alpha = 1, lambda = cv$lambda.min)
#Display regression coefficients
coef(model)
14 x 1 sparse Matrix of class "dgCMatrix"
                       s0
(Intercept)  36.962374270
crim         -0.092549948
zn            0.048543171
indus        -0.008321076
chas          2.287418592
nox         -16.832483690
rm            3.810180182
age           .          
dis          -1.598716155
rad           0.286797839
tax          -0.012456750
ptratio      -0.950997301
black         0.009652895
lstat        -0.528739166
# Make predictions on the test data
x.laso.test <- model.matrix(medv ~., test.boston.data)[,-1]
predictions.lasso <- model %>% predict(x.test) %>% as.vector()

# Model performance metrics
data.frame(
  RMSE = RMSE(predictions.lasso, test.boston.data$medv),
  Rsquare = R2(predictions.lasso, test.boston.data$medv)
)

Computing elastic net regression

The elastic net regression can be easily computed using the caret workflow, which invokes the glmnet package.

We use caret to automatically select the best tuning parameters alpha and lambda. The caret package tests a range of possible alpha and lambda values, then selects the best values for lambda and alpha, resulting in a final model that is an elastic net model.

Here, we’ll test the combination of 10 different values for alpha and lambda. This is specified using the option tuneLength.

The best alpha and lambda values are those values that minimize the cross-validation error

# # Build the model using the training set
# set.seed(123)
# model.elastic <- train(
#   medv ~., data = train.boston.data, method = "glmnet",
#   trcontrol = trainControl("cv", number = 5),
#   tuneLength = 10
# )
# 
# # Best tuning parameter
# model.elastic$bestTune

Principal Component and Partial Least Squares Regression

Principal component regression

The principal component regression(PCR) first applies Principal Component Analysis on the data set to summarize the original predictor variables into few new variables also known as principal component (PCs), which are a linear combination of the original data.

These PCs are then used to build the linear regression model. The number of principal components, to incorporate in the model, is chosen by cross-validation (cv). Note that, PCR is suitable when the data set contains highly correlated predictors.

Partial least squares regression

A possible drawback of PCR is that we have no guarantee that the selected principal components are associated with the outcome. Here, the selection of the principal components to incorporate in the model is not supervised by the outcome variable.

An alternative to PCR is the Partial Least Squares (PLS) regression, which identifies new principal components that not only summarize the original predictors, but also are related to the outcome. These components are then used to fit the regression model. So, compared to PCR, PLS uses a dimension reduction strategy that is supervised by the outcome.

Like PCR, PLS is convenient for data with highly-correlated predictors. The number of PCs used in PLS is generally chosen by cross-validation. Predictors and the outcome variables should be generally standardized, to make the variables comparable.

Loading required R packages

  • tidyverse for easy data manipulation and visualization
  • caret for easy machine learning workflow
  • pls, for computing PCR and PLS
library(tidyverse)
library(caret)
library(pls)

Attaching package: ‘pls’

The following object is masked from ‘package:caret’:

    R2

The following object is masked from ‘package:stats’:

    loadings

Preparing the data

We’ll use the boston data set

Computing principal component regression

# Build the model on training set

set.seed(123)

model.pc <- train(
  medv~., data = train.boston.data, method = "pcr",
  scale = TRUE,
  trControl = trainControl("cv", number = 10),
  tuneLength = 10
)

# Plot model RMSE vs different values of components

plot(model.pc)


#Print the best tunning parameter ncomp that 
# minimize the cross-validation error, RMSE
model.pc$bestTune
summary(model.pc$finalModel)
Data:   X dimension: 407 13 
    Y dimension: 407 1
Fit method: svdpc
Number of components considered: 5
TRAINING: % variance explained
          1 comps  2 comps  3 comps  4 comps  5 comps
X           47.48    58.40    68.00    74.75    80.94
.outcome    38.10    51.02    64.43    65.24    71.17
# Make predictions
predictions.pc <- model.pc %>% predict(test.boston.data)
# Model performance metrics

data.frame(
  RMSE = caret::RMSE(predictions.pc, test.boston.data$medv),
  Rsquare = caret::R2(predictions.pc, test.boston.data$medv)
)

The plot shows the prediction error made by the model according to the number of principal components incorporated in the model.

Our analysis shows that, choosing five principal components (ncomp = 5) gives the smallest prediction error RMSE.

The summary() function also provides the percentage of variance explained in the predictors(x) and in the outcome (medv) using different numbers of components.

For example, 80.94% of the variation (or information) contained in the predictors are captured by 5 components (ncomp = 5). Additionally, setting ncomp = 5, captures 71% of the information in the outcome variable (medv), which is good

Taken together, cross-validation identifies ncomp = 5 as the optimal number of PCs that minimize the prediction error (RMSE) and explain enough variation in the predictors and in the outcome.

Computing partial least squares

# Build the model on training set
set.seed(123)
model.pls <- train(
  medv~., data = train.boston.data, method = "pls",
  scale = TRUE,
  trControl = trainControl("cv", number = 10),
  tuneLength = 10
)

# Plot model RMSE vs Different values of components
plot(model.pls)

model.pls$bestTune
summary(model.pls$finalModel)
Data:   X dimension: 407 13 
    Y dimension: 407 1
Fit method: oscorespls
Number of components considered: 9
TRAINING: % variance explained
          1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps  9 comps
X           46.19    57.32    64.15    69.76    75.63    78.66    82.85    85.92    90.36
.outcome    50.90    71.84    73.71    74.71    75.18    75.35    75.42    75.48    75.49
#Make predictions
predictions.pls <- model.pls %>% predict(test.boston.data)

# Model performance metrics

data.frame(
  RMSE = caret::RMSE(predictions.pls, test.boston.data$medv),
  Rsquare = caret::R2(predictions.pls, test.boston.data$medv)
)

The optimal number of principal components included in the PLS model is 9. This captures 90% of the variation in the predictors and 75% of the variation in the outcome variable (medv).

In our example, the cross-validation error RMSE obtained with the PLS model is lower than the RMSE obtained using the PCR method. So, the PLS model is the best model, for explaining our data, compared to the PCR model.

Classification

In this part, we’ll cover the following topics:

Most of the classification algorithms computes the probability of belonging to a given class.

Observations are then assigned to the class that have the highest probability score.

Generally, you need to decide on a probability cutoff above which you consider an observation as belonging to a given class.

Dataset will be using

  1. PimaIndiansDiabetes2 data set

The Pima Indian Diabetes data set is available in the mlbench package. It will be used for binary classification.

# Load the data set
data("PimaIndiansDiabetes2", package = "mlbench")

# Inspect the data
head(PimaIndiansDiabetes2,4)
str(PimaIndiansDiabetes2)
'data.frame':   768 obs. of  9 variables:
 $ pregnant: num  6 1 8 1 0 5 3 10 2 8 ...
 $ glucose : num  148 85 183 89 137 116 78 115 197 125 ...
 $ pressure: num  72 66 64 66 40 74 50 NA 70 96 ...
 $ triceps : num  35 29 NA 23 35 NA 32 NA 45 NA ...
 $ insulin : num  NA NA NA 94 168 NA 88 NA 543 NA ...
 $ mass    : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 NA ...
 $ pedigree: num  0.627 0.351 0.672 0.167 2.288 ...
 $ age     : num  50 31 32 21 33 30 26 29 53 54 ...
 $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...

The data contains 768 individuals (female) and 9 clinical variables for predicting the probability of individuals being diabetes-positive or negative:

Column Names Description
pregnant Number of times pregnant
glucose Plasma glucose concentration (glucose tolerance test)
pressure Diastolic blood pressure (mm Hg)
triceps Triceps skin fold thickness (mm)
insulin 2-Hour serum insulin (mu U/ml)
mass Body mass index (weight in kg/(height in m)^2)
pedigree Diabetes pedigree function
age Age (years)
diabetes Class variable (test for diabetes)

Performing the following steps might improve the accuracy of your model

set.seed(123)

training.Pima.samples <- PimaIndiansDiabetes2$diabetes %>%
  createDataPartition(p = 0.8, list = FALSE)

train.pima.data <- PimaIndiansDiabetes2[training.Pima.samples, ]
test.pima.data <- PimaIndiansDiabetes2[-training.Pima.samples, ]

Computing logistic regression

# Fit the model
model.lr.pima <- glm(diabetes~., data = train.pima.data, family = binomial)

# Summarize the model
summary(model.lr.pima)

Call:
glm(formula = diabetes ~ ., family = binomial, data = train.pima.data)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.6751  -0.6666  -0.3588   0.6581   2.5650  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept) -10.196902   1.369247  -7.447 9.54e-14 ***
pregnant      0.082032   0.061219   1.340  0.18025    
glucose       0.036517   0.006381   5.723 1.05e-08 ***
pressure      0.001333   0.013053   0.102  0.91863    
triceps       0.008425   0.018649   0.452  0.65145    
insulin      -0.001219   0.001441  -0.845  0.39784    
mass          0.081434   0.030448   2.675  0.00748 ** 
pedigree      1.186528   0.470790   2.520  0.01173 *  
age           0.030886   0.020337   1.519  0.12884    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 402.38  on 314  degrees of freedom
Residual deviance: 280.83  on 306  degrees of freedom
  (300 observations deleted due to missingness)
AIC: 298.83

Number of Fisher Scoring iterations: 5
## Make predictions

probabilities <- model.lr.pima %>% predict(test.pima.data, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg") 
# predicted.classes
# Model accuracy
mean(predicted.classes == test.pima.data$diabetes)
[1] NA

Simple logistic regression

The simple logistic regression is used to predict the probability of class membership based on one single predictor variable.

The following R code builds a model to predict the probability of being diabetes-positive based on the plasma glucose concentration:

model.logit.simple <- glm(diabetes ~ glucose, data = train.pima.data, family = binomial)
summary(model.logit.simple)

Call:
glm(formula = diabetes ~ glucose, family = binomial, data = train.pima.data)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.1322  -0.7882  -0.5157   0.8641   2.2706  

Coefficients:
             Estimate Std. Error z value Pr(>|z|)    
(Intercept) -5.555585   0.482107  -11.52   <2e-16 ***
glucose      0.039188   0.003697   10.60   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 790.12  on 610  degrees of freedom
Residual deviance: 637.56  on 609  degrees of freedom
  (4 observations deleted due to missingness)
AIC: 641.56

Number of Fisher Scoring iterations: 4

The output above shows the estimate of the regression beta coefficients and their significance levels. The intercept (b0) is -5.55 and the coefficient of glucose variable is 0.039.

The logistic equation can be written as p = exp(-5.55 + 0.039*glucose)/[1+exp(-5.55+0.039*glucose)]. Using this formula, for each new glucose plasma concentration value, you can predict the probability of the individual being diabetes-positive.

Predictions can be easily made using the function predict(). Use the option type = “response” to directly obtain the probabilities

newdata <- data.frame(glucose = c(20,190))
probabilities <- model.logit.simple  %>% predict(newdata, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")
predicted.classes
    1     2 
"neg" "pos" 
train.pima.data %>%
  mutate(prob = ifelse(diabetes == "pos", 1, 0)) %>%
  ggplot(aes(glucose,prob)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "glm", method.args = list(family = "binomial"))+
  labs(
    title = "Simple Logistic Regression Model",
    x = "Plasma Glucose Concentration",
    y = "Probability of being diabetes-pos"
  )

plot(model.logit.simple)

Stepwise Logistic Regression

Stepwise logistic regression consists of automatically selecting a reduced number of predictor variables for building the best performing logistic regression model.

Data set: PimaIndiansDiabetes2

Quick start R code

library(MASS)
# Fit the model

removed.missing.data <- na.omit(train.pima.data)
model <- glm(diabetes ~ ., data = removed.missing.data , family = binomial) %>% stepAIC(trace = FALSE)

summary(model)

Call:
glm(formula = diabetes ~ glucose + mass + pedigree + age, family = binomial, 
    data = removed.missing.data)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.8024  -0.6631  -0.3716   0.6617   2.5631  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept) -10.081719   1.202909  -8.381  < 2e-16 ***
glucose       0.033770   0.005536   6.101 1.06e-09 ***
mass          0.083808   0.022726   3.688 0.000226 ***
pedigree      1.110791   0.456960   2.431 0.015064 *  
age           0.051063   0.014602   3.497 0.000471 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 402.38  on 314  degrees of freedom
Residual deviance: 283.63  on 310  degrees of freedom
AIC: 293.63

Number of Fisher Scoring iterations: 5
removed.missing.data.from.test <- na.omit(test.pima.data)

probabilities.step <- model %>% predict(removed.missing.data.from.test, type = "response")
predicted.classes.step.logit <- ifelse(probabilities.step > 0.5, "pos", "neg")

#Model Accuracy
mean(predicted.classes.step.logit == removed.missing.data.from.test$diabetes)
[1] 0.7922078
df <- data.frame("Predicted" = predicted.classes.step.logit)
(df$Predicted == test.pima.data$diabetes)
longer object length is not a multiple of shorter object lengthlonger object length is not a multiple of shorter object length
  [1]  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE
 [22]  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE
 [43] FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE  TRUE  TRUE FALSE  TRUE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE
 [64]  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE FALSE
 [85] FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE
[106] FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE FALSE  TRUE FALSE  TRUE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE
[127]  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE
[148] FALSE  TRUE FALSE  TRUE FALSE  TRUE

Penalized logistic regression

Penalized logistic regression imposes a penalty to the logistic model for having too many variables. This results in shrinking the coefficients of the less contributive variables toward zero. This is also known as regularization. The most commonly used penalized regression methods include:

  • Ridge regression: variables with minor contribution have their coefficients close to zero. However, all the variables are incorporated in the model. This is useful when all variables need to be incorporated in the model according to domain knowledge.

  • lasso regression: the coefficients of some less contributive variables are forced to be exactly zero. Only the most significant variables are kept in the final model.

  • elastic net regression: the combination of ridge and lasso regression. It shrinks some coefficients toward zero (like ridge regression) and sets some coefficients to exactly zero (like lasso regression)

Required packages

library(tidyverse)
library(caret)
library(glmnet)

Preparing the data

Data set: PimaIndiansDiabetes2

# Load the data and remove NAs
Pima.Indians.data <- na.omit(PimaIndiansDiabetes2)
# Inspect the data
sample_n(Pima.Indians.data, 3)

# Split the data into training and test set
set.seed(123)

training.samples <- Pima.Indians.data$diabetes %>%
  createDataPartition(p = 0.8, list = FALSE)

train.pima.indians <- Pima.Indians.data[training.samples, ]
test.pima.indians <- Pima.Indians.data[-training.samples, ]

Additional data preparation

The R function model.matrix() helps create the matrix of predictors and also automatically converts categorical predictors to appropriate dummy variables, which is required for the glmnet() function.

# Dummy code categorical predictor variables
x <- model.matrix(diabetes ~., train.pima.indians)[,-1]

# Convert the outcome (class) to a numerical variable
y <- ifelse(train.pima.indians$diabetes == "pos", 1, 0)

We’ll use the R function glmnet() for computing penalized logistic regression.

The simplified format is as follow:

library(glmnet)
glmnet(x, y, family = "binomial", alpha = 1, lambda = NULL)

Call:  glmnet(x = x, y = y, family = "binomial", alpha = 1, lambda = NULL) 

      Df      %Dev    Lambda
 [1,]  0 3.147e-15 0.2480000
 [2,]  1 3.690e-02 0.2259000
 [3,]  1 6.743e-02 0.2059000
 [4,]  1 9.292e-02 0.1876000
 [5,]  1 1.144e-01 0.1709000
 [6,]  1 1.325e-01 0.1557000
 [7,]  1 1.479e-01 0.1419000
 [8,]  1 1.610e-01 0.1293000
 [9,]  1 1.722e-01 0.1178000
[10,]  2 1.851e-01 0.1073000
[11,]  2 1.970e-01 0.0978000
[12,]  2 2.071e-01 0.0891100
[13,]  2 2.158e-01 0.0812000
[14,]  2 2.233e-01 0.0739800
[15,]  3 2.314e-01 0.0674100
[16,]  5 2.400e-01 0.0614200
[17,]  5 2.484e-01 0.0559600
[18,]  5 2.558e-01 0.0509900
[19,]  5 2.620e-01 0.0464600
[20,]  5 2.674e-01 0.0423400
[21,]  5 2.721e-01 0.0385700
[22,]  6 2.762e-01 0.0351500
[23,]  6 2.798e-01 0.0320300
[24,]  6 2.829e-01 0.0291800
[25,]  6 2.856e-01 0.0265900
[26,]  6 2.879e-01 0.0242300
[27,]  6 2.898e-01 0.0220700
[28,]  6 2.915e-01 0.0201100
[29,]  6 2.929e-01 0.0183300
[30,]  6 2.941e-01 0.0167000
[31,]  6 2.951e-01 0.0152100
[32,]  6 2.959e-01 0.0138600
[33,]  6 2.967e-01 0.0126300
[34,]  7 2.975e-01 0.0115100
[35,]  7 2.984e-01 0.0104900
[36,]  7 2.993e-01 0.0095550
[37,]  7 2.999e-01 0.0087060
[38,]  7 3.005e-01 0.0079330
[39,]  7 3.010e-01 0.0072280
[40,]  7 3.014e-01 0.0065860
[41,]  7 3.018e-01 0.0060010
[42,]  8 3.022e-01 0.0054680
[43,]  8 3.025e-01 0.0049820
[44,]  8 3.028e-01 0.0045390
[45,]  8 3.030e-01 0.0041360
[46,]  8 3.032e-01 0.0037690
[47,]  8 3.034e-01 0.0034340
[48,]  8 3.035e-01 0.0031290
[49,]  8 3.037e-01 0.0028510
[50,]  8 3.038e-01 0.0025980
[51,]  8 3.038e-01 0.0023670
[52,]  8 3.039e-01 0.0021570
[53,]  8 3.040e-01 0.0019650
[54,]  8 3.040e-01 0.0017900
[55,]  8 3.040e-01 0.0016310
[56,]  8 3.041e-01 0.0014860
[57,]  8 3.041e-01 0.0013540
[58,]  8 3.041e-01 0.0012340
[59,]  8 3.042e-01 0.0011240
[60,]  8 3.042e-01 0.0010250
[61,]  8 3.042e-01 0.0009336
[62,]  8 3.042e-01 0.0008506
[63,]  8 3.042e-01 0.0007750
set.seed(123)
cv.lasso <- cv.glmnet(x,y, alpha = 1, family = "binomial")
# cv.lasso
# summary(cv.lasso)
# Fit the final model on the training data
model.logit.lasso <- glmnet(x,y, alpha = 1, family = "binomial", lambda = cv.lasso$lambda.min)

# Display regression coefficients
coef(model.logit.lasso)
9 x 1 sparse Matrix of class "dgCMatrix"
                       s0
(Intercept) -8.6158632778
pregnant     0.0350263984
glucose      0.0369158677
pressure     .           
triceps      0.0164757684
insulin     -0.0003928031
mass         0.0304940618
pedigree     0.7854910473
age          0.0362782651
# Make predictions on the test data
x.test <- model.matrix(diabetes~., test.pima.indians)[,-1]
# BUG FIX: predict() on a binomial glmnet fit returns the linear predictor
# (log-odds) by default (type = "link"); type = "response" is required to get
# probabilities, which is what both the variable name and the 0.5 cutoff
# below assume. Without it, the cutoff is applied on the log-odds scale.
probabilities.logit.lasso <- model.logit.lasso %>% predict(newx = x.test, type = "response")
# Classify as "pos" when the predicted probability exceeds 0.5
predict.classess.logit.lasso <- ifelse(probabilities.logit.lasso > 0.5, "pos", "neg")

# Model accuracy: proportion of test observations classified correctly
observed.classes <- test.pima.indians$diabetes
mean(predict.classess.logit.lasso == observed.classes)
[1] 0.7692308
plot(cv.lasso)

The plot displays the cross-validation error according to the log of lambda. The left dashed vertical line indicates that the log of the optimal value of lambda is approximately -5, which is the one that minimizes the prediction error. This lambda value will give the most accurate model. The exact value of lambda can be viewed as follow:

cv.lasso$lambda.min
[1] 0.008706319
cv.lasso$lambda.1se
[1] 0.06740987

Using lambda.min as the best lambda, gives the following regression coefficients:

coef(cv.lasso, cv.lasso$lambda.min)
9 x 1 sparse Matrix of class "dgCMatrix"
                        1
(Intercept) -8.6156146833
pregnant     0.0350758913
glucose      0.0369156360
pressure     .           
triceps      0.0164842605
insulin     -0.0003924262
mass         0.0304848446
pedigree     0.7855063693
age          0.0362650459
coef(cv.lasso, cv.lasso$lambda.1se)
9 x 1 sparse Matrix of class "dgCMatrix"
                       1
(Intercept) -4.657500725
pregnant     .          
glucose      0.026284471
pressure     .          
triceps      0.001905497
insulin      .          
mass         .          
pedigree     .          
age          0.017338402

Logistic Regression Assumptions and Diagnostics

The logistic regression method assumes that:

  • The outcome is a binary or dichotomous variable like yes vs no, positive vs negative, 1 vs 0.

  • There is a linear relationship between the logit of the outcome and each predictor variable. Recall that the logit function is logit(p) = log(p/(1-p)), where p is the probability of the outcome

  • There is no influential value (extreme values or outliers) in the continuous predictors

  • There is no high intercorrelation (i.e. multicollinearity) among the predictors.

Loading required R packages

library(tidyverse)
library(broom)
theme_set(theme_classic())
# Drop rows with any missing value; glm() would otherwise silently drop them
PimaIndiansDiabetes2.cleaned <- na.omit(PimaIndiansDiabetes2)
# Fit the logistic regression model on all predictors
model.pima.logit2 <- glm(diabetes~., data = PimaIndiansDiabetes2.cleaned, family = binomial)
# Predict the probability (p) of diabetes positivity on the fitted data
# (type = "response" returns probabilities rather than log-odds)
probabilities.pima.logit <- predict(model.pima.logit2, type = "response")
# Threshold at 0.5 to obtain predicted class labels
predicted.pima.classes <- ifelse(probabilities.pima.logit > 0.5, "pos", "neg")
head(predicted.pima.classes)
    4     5     7     9    14    15 
"neg" "pos" "neg" "pos" "pos" "pos" 
# train.pima.indians
# Keep only the numeric predictors for the linearity-of-the-logit diagnostic
mydata <- PimaIndiansDiabetes2.cleaned  %>%
  dplyr::select_if(is.numeric)
predictors <- colnames(mydata)
# Bind the logit of the fitted probabilities and reshape to long format for
# plotting; pivot_longer() replaces the superseded gather().
model.pima.logit <- mydata %>%
  mutate(logit = log(probabilities.pima.logit/(1-probabilities.pima.logit))) %>%
  pivot_longer(cols = -logit, names_to = "predictors", values_to = "predictor.value")
# One scatter + loess panel per predictor: an approximately straight loess
# curve supports the linearity-of-the-logit assumption for that predictor.
ggplot(model.pima.logit, aes(logit, predictor.value)) +
  geom_point(size = 0.5, alpha = 0.5) +
  geom_smooth(method = "loess") +
  theme_bw() +
  facet_wrap(~predictors, scales = "free_y")

Influential values

Influential values are extreme individual data points that can alter the quality of the logistic regression model. The most extreme values in the data can be examined by visualizing the Cook’s distance values.

Here we label the top 3 largest values:

plot(model.pima.logit2, which = 4, id.n = 3)

Note that, not all outliers are influential observations. To check whether the data contains potential influential observations, the standardized residual error can be inspected. Data points with an absolute standardized residual above 3 represent possible outliers and may deserve closer attention.

The following R code computes the standardized residuals (.std.resid) and the Cook’s distance (.cooksd) using the R function augment()[broom package].

# Extract model results

# augment() attaches per-observation diagnostics (.std.resid, .cooksd, ...);
# index gives each row a sequential id used as the x-axis in the plot below.
model.pima.logit.data <- augment(model.pima.logit2) %>%
  mutate(index = 1:n())

The data for the top 3 largest values, according to the Cook’s distance, can be displayed as follow:

model.pima.logit.data %>% top_n(3, .cooksd)
NA

Plot the standardized residuals:

# Plot the standardized residuals by observation index, colored by outcome;
# points with |.std.resid| > 3 would flag potential outliers.
ggplot(model.pima.logit.data, aes(index, .std.resid)) +
  geom_point(aes(color = diabetes), alpha = 0.5) +
  theme_bw()

Filter potential influential data points with abs(.std.res) > 3

model.pima.logit.data %>% filter(abs(.std.resid) > 3)

There is no influential observation in our data.

When we have outliers in a continuous predictor, potential solutions include:

  • Removing the concerned records
  • Transform the data into log scale
  • Use non-parametric methods

Multicollinearity

Multicollinearity corresponds to a situation where the data contain highly correlated predictor variables.

Multicollinearity is an important issue in regression analysis and should be fixed by removing the concerned variables. It can be assessed using the R function vif()

car::vif(model.pima.logit2)
pregnant  glucose pressure  triceps  insulin     mass pedigree      age 
1.892387 1.378937 1.191287 1.638865 1.384038 1.832416 1.031715 1.974053 

As a rule of thumb, a VIF value that exceeds 5 or 10 indicates a problematic amount of collinearity. In our example there is no collinearity: all variables have a value of VIF well below 5.

Multinomial logistic regression

The multinomial logistic regression is an extension of the logistic regression for multiclass classification tasks. It is used when the outcome involves more than two classes.

Loading required R packages

library(tidyverse)
library(caret)
library(nnet)

Attaching package: ‘nnet’

The following object is masked from ‘package:mgcv’:

    multinom

Preparing the data

We’ll use the iris data set

# Load the built-in iris data set
data("iris")
# Peek at three random rows
sample_n(iris, 3)

# Hold out 20% of the rows as a test set, stratified on Species
set.seed(123)
training.samples.iris <- createDataPartition(iris$Species, p = 0.8, list = FALSE)
train.iris.data <- iris[training.samples.iris, ]
test.iris.data <- iris[-training.samples.iris, ]

Computing multinomial logistic regression

# Fit the multinomial logistic regression model; the iteration log printed
# below comes from the underlying nnet optimizer.
model <- nnet::multinom(Species ~., data = train.iris.data)
# weights:  18 (10 variable)
initial  value 131.833475 
iter  10 value 13.707358
iter  20 value 5.548255
iter  30 value 5.196272
iter  40 value 4.985881
iter  50 value 4.978698
iter  60 value 4.970034
iter  70 value 4.968625
iter  80 value 4.967145
iter  90 value 4.967125
iter 100 value 4.967097
final  value 4.967097 
stopped after 100 iterations
# Summarize the model
summary(model)
Call:
nnet::multinom(formula = Species ~ ., data = train.iris.data)

Coefficients:
           (Intercept) Sepal.Length Sepal.Width Petal.Length Petal.Width
versicolor    17.29267    -5.643165   -8.391525     14.61331   -2.091772
virginica    -21.99694    -6.650591  -14.538985     22.01169   14.158731

Std. Errors:
           (Intercept) Sepal.Length Sepal.Width Petal.Length Petal.Width
versicolor    43.11205     119.7240    198.5332     91.66177    59.26249
virginica     43.68199     119.7252    198.5908     91.81816    59.59031

Residual Deviance: 9.934194 
AIC: 29.93419 
# Make predictions on the held-out test set; the result is a factor of
# predicted Species labels (see levels printed below).

predicted.species <- model %>% predict(test.iris.data)
head(predicted.species)
[1] setosa setosa setosa setosa setosa setosa
Levels: setosa versicolor virginica
# Model accuracy
mean(predicted.species == test.iris.data$Species)
[1] 0.9666667

Discriminant Analysis

The following discriminant analysis methods will be described:

  • Linear discriminant analysis (LDA): Uses linear combinations of predictors to predict the class of a given observation. Assumes that the predictor variables (p) are normally distributed and the classes have identical variances (for univariate analysis, p = 1) or identical covariance matrices (for multivariate analysis, p > 1).

  • Quadratic discriminant analysis (QDA): More flexible than LDA. Here, there is no assumption that the covariance matrix of classes is the same

  • Mixture discriminant analysis (MDA): Each class is assumed to be a Gaussian mixture of subclasses.

  • Flexible Discriminant Analysis (FDA): Non-linear combination of predictors is used such as splines.

  • Regularized discriminant analysis (RDA): Regularization (or shrinkage) improves the estimate of the covariance matrices in situation where the number of predictors is larger than the number of samples in the training data. This leads to an improvement of the discriminant analysis.

Loading required R packages

library(tidyverse)
library(caret)
theme_set(theme_classic())

Preparing the data

  1. Split the data
# Load the data
data("iris")
# Split the data into training and test set (80/20, stratified on Species)
set.seed(123)
training.samples.iris <- iris$Species %>%
  createDataPartition(p=0.8, list = FALSE)

train.data.iris <- iris[training.samples.iris, ]
test.data.iris <- iris[-training.samples.iris, ]
  2. Normalize the data. Categorical variables are automatically ignored.
# Estimate preprocessing parameters (means and SDs for centering/scaling)
# from the TRAINING data only, to avoid leaking test-set information.
preproc.pram <- train.data.iris %>%
  preProcess(method = c("center", "scale"))

# Transform both sets using the same estimated parameters
train.transformed <- preproc.pram %>% predict(train.data.iris)
test.transformed <- preproc.pram %>% predict(test.data.iris)

Before performing LDA, consider: * Inspecting the univariate distribution of each variable and make sure that they are normally distributed. If not, you can transform them using log and root for exponential distributions and Box-Cox for skewed distributions.

* Removing outliers from your data and standardizing the variables to make their scale comparable.

R code

library(MASS)

# Fit a linear discriminant analysis model on the standardized training data
model.lda <- lda(Species ~., data = train.transformed)
# Make predictions; predict() returns a list with $class (labels),
# $posterior (class probabilities) and $x (discriminant scores)
predictions <- model.lda %>% predict(test.transformed)

# Model accuracy: fraction of test observations with correct class
mean(predictions$class == test.transformed$Species)
[1] 1
model.lda
Call:
lda(Species ~ ., data = train.transformed)

Prior probabilities of groups:
    setosa versicolor  virginica 
 0.3333333  0.3333333  0.3333333 

Group means:
           Sepal.Length Sepal.Width Petal.Length Petal.Width
setosa       -1.0120728   0.7867793   -1.2927218  -1.2496079
versicolor    0.1174121  -0.6478157    0.2724253   0.1541511
virginica     0.8946607  -0.1389636    1.0202965   1.0954568

Coefficients of linear discriminants:
                    LD1         LD2
Sepal.Length  0.9108023  0.03183011
Sepal.Width   0.6477657  0.89852536
Petal.Length -4.0816032 -2.22724052
Petal.Width  -2.3128276  2.65441936

Proportion of trace:
   LD1    LD2 
0.9905 0.0095 
summary(model.lda)
        Length Class  Mode     
prior    3     -none- numeric  
counts   3     -none- numeric  
means   12     -none- numeric  
scaling  8     -none- numeric  
lev      3     -none- character
svd      2     -none- numeric  
N        1     -none- numeric  
call     3     -none- call     
terms    3     terms  call     
xlevels  0     -none- list     

LDA determines group means and computes, for each individual, the probability of belonging to the different groups. The individual is then assigned to the group with the highest probability score.

The lda() outputs contain the following elements: * Prior probabilities of groups: the proportion of training observations in each group. For example, there are 33% of the training observations in the setosa group * Group means: group center of gravity. Shows the mean of each variable in each group. * Coefficients of linear discriminants: Shows the linear combination of predictor variables that are used to form the LDA decision rule. For example

LD1 = 0.91*Sepal.Length + 0.65*Sepal.Width - 4.08*Petal.Length - 2.31*Petal.Width LD2 = 0.03*Sepal.Length + 0.90*Sepal.Width - 2.23*Petal.Length + 2.65*Petal.Width

Using the function plot() produces plots of the linear discriminants, obtained by computing LD1 and LD2 for each of the training observations

plot(model.lda)

# Append the training-set discriminant scores (LD1, LD2) to the data so
# observations can be plotted in discriminant space, colored by species
lda.data <- cbind(train.transformed, predict(model.lda)$x)

ggplot(lda.data, aes(LD1, LD2)) +
  geom_point(aes(color = Species))

It can be seen that, our model correctly classified 100% of observations, which is excellent.

Note that, by default, the probability cutoff used to decide group-membership is 0.5

In some situation, you might want to increase the precision of the model. In this case you can fine-tune the model by adjusting the posterior probability cutoff. For example, you can increase or lower the cutoff

Variable selection:

Note that, if the predictor variables are standardized before computing LDA, the discriminator weights can be used as measures of variable importance for feature selection

Quadratic discriminant analysis - QDA

QDA is a little bit more flexible than LDA, in the sense that it does not assume the equality of variance/covariance. In other words, for QDA the covariance matrix can be different for each class.

LDA tends to perform better than QDA when you have a small training set.

In contrast, QDA is recommended if the training set is very large, so that the variance of the classifier is not a major issue, or if the assumption of a common covariance matrix for the K classes is clearly untenable

QDA can be computed using the R function qda() [MASS package]

library(MASS)
# Fit a quadratic discriminant analysis model (per-class covariance matrices)
model.qda <- qda(Species~., data = train.transformed)
model.qda
Call:
qda(Species ~ ., data = train.transformed)

Prior probabilities of groups:
    setosa versicolor  virginica 
 0.3333333  0.3333333  0.3333333 

Group means:
           Sepal.Length Sepal.Width Petal.Length Petal.Width
setosa       -1.0120728   0.7867793   -1.2927218  -1.2496079
versicolor    0.1174121  -0.6478157    0.2724253   0.1541511
virginica     0.8946607  -0.1389636    1.0202965   1.0954568
# Make predictions on the standardized test set
predictions.qda <- model.qda %>% predict(test.transformed)

# Model accuracy: fraction of correctly classified test observations
mean(predictions.qda$class == test.transformed$Species)
[1] 1

Mixture discriminant analysis

The LDA classifier assumes that each class comes from a single normal (or Gaussian ) distribution.

This is too restrictive.

For MDA, there are classes, and each class is assumed to be a Gaussian mixture of subclasses, where each data point has a probability of belonging to each class. Equality of covariance matrix, among classes, is still assumed

library(mda)
Loading required package: class
Loaded mda 0.4-10
model.mda <- mda(Species~., data = train.transformed)
model.mda
Call:
mda(formula = Species ~ ., data = train.transformed)

Dimension: 5 

Percent Between-Group Variance Explained:
    v1     v2     v3     v4     v5 
 96.77  99.63  99.86 100.00 100.00 

Degrees of Freedom (per dimension): 5 

Training Misclassification Error: 0.025 ( N = 120 )

Deviance: 14.005 
# Make predictions with the mixture discriminant model; predict() here
# returns the class labels directly

predicted.classes.mda <- model.mda %>% predict(test.transformed)

# Model accuracy on the held-out test set
mean(predicted.classes.mda == test.transformed$Species)
[1] 1

Flexible discriminant analysis - FDA

FDA is a flexible extension of LDA that uses non-linear combinations of predictors such as splines. FDA is useful to model multivariate non-normality or non-linear relationships among variables within each group, allowing for a more accurate classification

library(mda)
# Fit a flexible discriminant analysis model on the standardized training set
model.fda <- fda(Species~., data = train.transformed)
model.fda
Call:
fda(formula = Species ~ ., data = train.transformed)

Dimension: 2 

Percent Between-Group Variance Explained:
    v1     v2 
 99.05 100.00 

Degrees of Freedom (per dimension): 5 

Training Misclassification Error: 0.025 ( N = 120 )
# Predict test-set class labels with the FDA model
predicted.classes.fda <- model.fda %>% predict(test.transformed)
# Model accuracy on the held-out test set
mean(predicted.classes.fda == test.transformed$Species)
[1] 1

Regularized discriminant analysis

RDA builds a classification rule by regularizing the group covariance matrices allowing a more robust model against multicollinearity in the data. This might be very useful for a large multivariate data set containing highly correlated predictors.

Regularized discriminant analysis is a kind of a trade-off between LDA and QDA. Recall that, in LDA we assume equality of the covariance matrix for all of the classes, whereas QDA assumes different covariance matrices for all the classes. Regularized discriminant analysis is an intermediate between LDA and QDA.

RDA shrinks the separate covariances of QDA toward a common covariance as in LDA. This improves the estimate of the covariance matrices in situations where the number of predictors is larger than the number of samples in the training data, potentially leading to an improvement of the model accuracy.

library(klaR)
# Fit a regularized discriminant analysis model (shrinks class covariances
# toward a common covariance, intermediate between LDA and QDA)
model.rda <- rda(Species~., data = train.transformed)

# Make predictions; the result carries the labels in $class
predictions.rda <- model.rda %>% predict(test.transformed)

# Model accuracy on the held-out test set

mean(predictions.rda$class == test.transformed$Species)
[1] 1

Naive Bayes Classifier

The Naive Bayes Classifier is a simple and powerful method that can be used for binary and multiclass classification problems.

Naive Bayes classifier predicts the class membership probability of observations using Bayes theorem, which is based on conditional probability, that is the probability of something to happen, given that something else has already occurred.

Will use PimaIndiansDiabetes2 data set

Computing Naive Bayes

library(klaR)
# Fit a Naive Bayes classifier.
# NOTE(review): train.pima.indians is assumed to be the PimaIndiansDiabetes2
# training split built in an earlier chunk -- not visible here; confirm.
model.naive <- NaiveBayes(diabetes ~., data = train.pima.indians)

model.naive
$apriori
grouping
      neg       pos 
0.6687898 0.3312102 

$tables
$tables$pregnant
        [,1]     [,2]
neg 2.890476 2.645282
pos 4.471154 3.968215

$tables$glucose
        [,1]     [,2]
neg 112.1381 25.23393
pos 147.1635 29.39590

$tables$pressure
        [,1]     [,2]
neg 69.19048 12.13173
pos 73.54808 13.71264

$tables$triceps
        [,1]      [,2]
neg 27.83333 10.668822
pos 32.68269  9.539152

$tables$insulin
        [,1]     [,2]
neg 137.0571 110.5755
pos 214.2115 140.5018

$tables$mass
        [,1]     [,2]
neg 32.11524 6.905892
pos 35.42500 6.825278

$tables$pedigree
         [,1]      [,2]
neg 0.4699952 0.3091545
pos 0.6139904 0.3853125

$tables$age
       [,1]      [,2]
neg 28.6619  9.085715
pos 35.8750 10.248076


$levels
[1] "neg" "pos"

$call
NaiveBayes.default(x = X, grouping = Y)

$x
    pregnant glucose pressure triceps insulin mass pedigree age
4          1      89       66      23      94 28.1    0.167  21
5          0     137       40      35     168 43.1    2.288  33
7          3      78       50      32      88 31.0    0.248  26
9          2     197       70      45     543 30.5    0.158  53
14         1     189       60      23     846 30.1    0.398  59
15         5     166       72      19     175 25.8    0.587  51
17         0     118       84      47     230 45.8    0.551  31
19         1     103       30      38      83 43.3    0.183  33
20         1     115       70      30      96 34.6    0.529  32
26        10     125       70      26     115 31.1    0.205  41
33         3      88       58      11      54 24.8    0.267  22
44         9     171      110      24     240 45.4    0.721  54
51         1     103       80      11      82 19.4    0.491  22
52         1     101       50      15      36 24.2    0.526  26
53         5      88       66      21      23 24.4    0.342  30
54         8     176       90      34     300 33.7    0.467  58
55         7     150       66      42     342 34.7    0.718  42
57         7     187       68      39     304 37.7    0.254  41
58         0     100       88      60     110 46.8    0.962  31
64         2     141       58      34     128 25.4    0.699  24
69         1      95       66      13      38 19.6    0.334  25
70         4     146       85      27     100 28.9    0.189  27
71         2     100       66      20      90 32.9    0.867  28
74         4     129       86      20     270 35.1    0.231  23
83         7      83       78      26      71 29.3    0.767  36
86         2     110       74      29     125 32.4    0.698  27
88         2     100       68      25      71 38.5    0.324  26
89        15     136       70      32     110 37.1    0.153  43
92         4     123       80      15     176 32.0    0.443  34
93         7      81       78      40      48 46.7    0.261  42
95         2     142       82      18      64 24.7    0.761  21
96         6     144       72      27     228 33.9    0.255  40
98         1      71       48      18      76 20.4    0.323  22
99         6      93       50      30      64 28.7    0.356  23
100        1     122       90      51     220 49.7    0.325  31
104        1      81       72      18      40 26.6    0.283  24
108        4     144       58      28     140 29.5    0.287  37
109        3      83       58      31      18 34.3    0.336  25
112        8     155       62      26     495 34.0    0.543  46
113        1      89       76      34      37 31.2    0.192  23
115        7     160       54      32     175 30.5    0.588  39
120        4      99       76      15      51 23.2    0.223  21
121        0     162       76      56     100 53.2    0.759  25
123        2     107       74      30     100 33.6    0.404  23
126        1      88       30      42      99 55.0    0.496  26
127        3     120       70      30     135 42.9    0.452  30
128        1     118       58      36      94 33.3    0.261  23
129        1     117       88      24     145 34.5    0.403  40
131        4     173       70      14     168 29.7    0.361  33
133        3     170       64      37     225 34.5    0.356  30
135        2      96       68      13      49 21.1    0.647  26
136        2     125       60      20     140 33.8    0.088  31
140        5     105       72      29     325 36.9    0.159  28
143        2     108       52      26      63 32.5    0.318  22
145        4     154       62      31     284 32.8    0.237  23
148        2     106       64      35     119 30.5    1.400  34
151        1     136       74      50     204 37.4    0.399  24
153        9     156       86      28     155 34.3    1.189  42
154        1     153       82      42     485 40.6    0.687  23
158        1     109       56      21     135 25.2    0.833  23
159        2      88       74      19      53 29.0    0.229  22
160       17     163       72      41     114 40.9    0.817  47
162        7     102       74      40     105 37.2    0.204  45
163        0     114       80      34     285 44.2    0.167  27
172        6     134       70      23     130 35.4    0.542  29
174        1      79       60      42      48 43.5    0.678  23
175        2      75       64      24      55 29.7    0.370  33
178        0     129      110      46     130 67.1    0.319  26
182        0     119       64      18      92 34.9    0.725  23
187        8     181       68      36     495 30.1    0.615  60
188        1     128       98      41      58 32.0    1.321  33
189        8     109       76      39     114 27.9    0.640  31
190        5     139       80      35     160 31.6    0.361  25
192        9     123       70      44      94 33.1    0.374  40
196        5     158       84      41     210 39.4    0.395  29
198        3     107       62      13      48 22.9    0.678  23
199        4     109       64      44      99 34.8    0.905  26
200        4     148       60      27     318 30.9    0.150  29
204        2      99       70      16      44 20.4    0.235  27
205        6     103       72      32     190 37.7    0.324  55
214        0     140       65      26     130 42.6    0.431  24
215        9     112       82      32     175 34.2    0.260  36
216       12     151       70      40     271 41.8    0.742  38
218        6     125       68      30     120 30.0    0.464  32
221        0     177       60      29     478 34.6    1.072  21
225        1     100       66      15      56 23.6    0.666  26
226        1      87       78      27      32 34.6    0.101  22
229        4     197       70      39     744 36.7    2.329  31
230        0     117       80      31      53 45.2    0.089  24
232        6     134       80      37     370 46.2    0.238  46
233        1      79       80      25      37 25.4    0.583  22
235        3      74       68      28      45 29.7    0.293  23
237        7     181       84      21     192 35.9    0.586  51
242        4      91       70      32      88 33.1    0.446  22
244        6     119       50      22     176 27.1    1.318  33
245        2     146       76      35     194 38.2    0.329  29
248        0     165       90      33     680 52.3    0.427  23
249        9     124       70      33     402 35.4    0.282  34
255       12      92       62       7     258 27.6    0.926  44
259        1     193       50      16     375 25.9    0.655  24
260       11     155       76      28     150 33.3    1.353  51
261        3     191       68      15     130 30.9    0.299  34
266        5      96       74      18      67 33.6    0.997  43
272        2     108       62      32      56 25.2    0.128  21
274        1      71       78      50      45 33.2    0.422  21
276        2     100       70      52      57 40.5    0.677  25
280        2     108       62      10     278 25.3    0.881  22
282       10     129       76      28     122 35.9    0.280  39
283        7     133       88      15     155 32.4    0.262  37
286        7     136       74      26     135 26.0    0.647  51
287        5     155       84      44     545 38.7    0.619  34
290        5     108       72      43      75 36.1    0.263  33
291        0      78       88      29      40 36.9    0.434  21
292        0     107       62      30      74 36.6    0.757  25
293        2     128       78      37     182 43.3    1.224  31
294        1     128       48      45     194 40.5    0.613  24
296        6     151       62      31     120 35.5    0.692  28
297        2     146       70      38     360 28.0    0.337  29
298        0     126       84      29     215 30.7    0.520  24
306        2     120       76      37     105 39.7    0.215  29
307       10     161       68      23     132 25.5    0.326  47
308        0     137       68      14     148 24.8    0.143  21
310        2     124       68      28     205 32.9    0.875  30
312        0     106       70      37     148 39.4    0.605  22
313        2     155       74      17      96 26.6    0.433  27
 [ reached 'max' / getOption("max.print") -- omitted 189 rows ]

$usekernel
[1] FALSE

$varnames
[1] "pregnant" "glucose"  "pressure" "triceps"  "insulin"  "mass"     "pedigree" "age"     

attr(,"class")
[1] "NaiveBayes"
# Make predictions on the test set; the warnings printed below about
# "Numerical 0 probability" come from density products underflowing to zero
prediction.naive <- model.naive %>% predict(test.pima.indians)
Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 32Numerical 0 probability for all classes with observation 33Numerical 0 probability for all classes with observation 
34Numerical 0 probability for all classes with observation 35Numerical 0 probability for all classes with observation 36Numerical 0 probability for all classes with observation 37Numerical 0 probability for all classes with observation 38Numerical 0 probability for all classes with observation 39Numerical 0 probability for all classes with observation 40Numerical 0 probability for all classes with observation 41Numerical 0 probability for all classes with observation 42Numerical 0 probability for all classes with observation 43Numerical 0 probability for all classes with observation 44Numerical 0 probability for all classes with observation 45Numerical 0 probability for all classes with observation 46Numerical 0 probability for all classes with observation 47Numerical 0 probability for all classes with observation 48Numerical 0 probability for all classes with observation 49Numerical 0 probability for all classes with observation 50Numerical 0 probability for all classes with observation 51Numerical 0 probability for all classes with observation 52Numerical 0 probability for all classes with observation 53Numerical 0 probability for all classes with observation 54Numerical 0 probability for all classes with observation 55Numerical 0 probability for all classes with observation 56Numerical 0 probability for all classes with observation 57Numerical 0 probability for all classes with observation 58Numerical 0 probability for all classes with observation 59Numerical 0 probability for all classes with observation 60Numerical 0 probability for all classes with observation 61Numerical 0 probability for all classes with observation 62Numerical 0 probability for all classes with observation 63Numerical 0 probability for all classes with observation 64Numerical 0 probability for all classes with observation 65Numerical 0 probability for all classes with observation 66Numerical 0 probability for all classes with observation 67Numerical 0 probability for all classes with 
observation 68Numerical 0 probability for all classes with observation 69Numerical 0 probability for all classes with observation 70Numerical 0 probability for all classes with observation 71Numerical 0 probability for all classes with observation 72Numerical 0 probability for all classes with observation 73Numerical 0 probability for all classes with observation 74Numerical 0 probability for all classes with observation 75Numerical 0 probability for all classes with observation 76Numerical 0 probability for all classes with observation 77Numerical 0 probability for all classes with observation 78
# Model accuracy: the prediction object carries the labels in $class
mean(prediction.naive$class == test.pima.indians$diabetes)
[1] 0.8205128

Using caret R package

The caret R package can automatically train the model and assess the model accuracy using k-fold cross-validation

library(klaR)

# Seed before train() so the 10 CV folds are reproducible
set.seed(123)

# Let caret tune and assess a naive Bayes model ("nb") via 10-fold CV
model.naive2 <- train(diabetes~., data = train.pima.indians, method = "nb", trControl = trainControl("cv", number = 10))
Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 
3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 
6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 
9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 
12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 32Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 
14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 32Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 
16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 
19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 
22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 
25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 
28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 32Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 
30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 32Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 
32Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 32Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 
2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 
5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 
8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 
11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 
14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 32Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 
16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 32
# make predictions
predicted.classes.naive2 <- model.naive2 %>% predict(test.pima.indians)
Numerical 0 probability for all classes with observation 1Numerical 0 probability for all classes with observation 2Numerical 0 probability for all classes with observation 3Numerical 0 probability for all classes with observation 4Numerical 0 probability for all classes with observation 5Numerical 0 probability for all classes with observation 6Numerical 0 probability for all classes with observation 7Numerical 0 probability for all classes with observation 8Numerical 0 probability for all classes with observation 9Numerical 0 probability for all classes with observation 10Numerical 0 probability for all classes with observation 11Numerical 0 probability for all classes with observation 12Numerical 0 probability for all classes with observation 13Numerical 0 probability for all classes with observation 14Numerical 0 probability for all classes with observation 15Numerical 0 probability for all classes with observation 16Numerical 0 probability for all classes with observation 17Numerical 0 probability for all classes with observation 18Numerical 0 probability for all classes with observation 19Numerical 0 probability for all classes with observation 20Numerical 0 probability for all classes with observation 21Numerical 0 probability for all classes with observation 22Numerical 0 probability for all classes with observation 23Numerical 0 probability for all classes with observation 24Numerical 0 probability for all classes with observation 25Numerical 0 probability for all classes with observation 26Numerical 0 probability for all classes with observation 27Numerical 0 probability for all classes with observation 28Numerical 0 probability for all classes with observation 29Numerical 0 probability for all classes with observation 30Numerical 0 probability for all classes with observation 31Numerical 0 probability for all classes with observation 32Numerical 0 probability for all classes with observation 33Numerical 0 probability for all classes with observation 
34Numerical 0 probability for all classes with observation 35Numerical 0 probability for all classes with observation 36Numerical 0 probability for all classes with observation 37Numerical 0 probability for all classes with observation 38Numerical 0 probability for all classes with observation 39Numerical 0 probability for all classes with observation 40Numerical 0 probability for all classes with observation 41Numerical 0 probability for all classes with observation 42Numerical 0 probability for all classes with observation 43Numerical 0 probability for all classes with observation 44Numerical 0 probability for all classes with observation 45Numerical 0 probability for all classes with observation 46Numerical 0 probability for all classes with observation 47Numerical 0 probability for all classes with observation 48Numerical 0 probability for all classes with observation 49Numerical 0 probability for all classes with observation 50Numerical 0 probability for all classes with observation 51Numerical 0 probability for all classes with observation 52Numerical 0 probability for all classes with observation 53Numerical 0 probability for all classes with observation 54Numerical 0 probability for all classes with observation 55Numerical 0 probability for all classes with observation 56Numerical 0 probability for all classes with observation 57Numerical 0 probability for all classes with observation 58Numerical 0 probability for all classes with observation 59Numerical 0 probability for all classes with observation 60Numerical 0 probability for all classes with observation 61Numerical 0 probability for all classes with observation 62Numerical 0 probability for all classes with observation 63Numerical 0 probability for all classes with observation 64Numerical 0 probability for all classes with observation 65Numerical 0 probability for all classes with observation 66Numerical 0 probability for all classes with observation 67Numerical 0 probability for all classes with 
observation 68Numerical 0 probability for all classes with observation 69Numerical 0 probability for all classes with observation 70Numerical 0 probability for all classes with observation 71Numerical 0 probability for all classes with observation 72Numerical 0 probability for all classes with observation 73Numerical 0 probability for all classes with observation 74Numerical 0 probability for all classes with observation 75Numerical 0 probability for all classes with observation 76Numerical 0 probability for all classes with observation 77Numerical 0 probability for all classes with observation 78
# Model n accuracy

mean(predicted.classes.naive2 == test.pima.indians$diabetes)
[1] 0.8076923

Support Vector Machine

Support Vector Machine (or SVM) is a machine learning technique used for classification tasks. Briefly, SVM works by identifying the optimal decision boundary that separates data points from different groups (or classes), and then predicts the class of new observations based on the separation boundary.

Depending on the situations, the different groups might be separable by a linear straight line or by a non-linear boundary line.

Support vector machine methods can handle both linear and non-linear class boundaries. It can be used for both two-class and multi-class classification problems.

In real life data, the separation boundary is generally nonlinear. Technically, the SVM algorithm performs a non-linear classification using what is called the kernel trick. The most commonly used kernel transformations are the _polynomial kernel_ and the _radial kernel_.

Note that, there is also an extension of the SVM for regression, called support vector regression.

Loading required R packages

library(tidyverse)
library(caret)

Dataset

Data set: PimaIndiansDiabetes2 [in mlbench package]

# Load the data
data("PimaIndiansDiabetes2", package = "mlbench")
pima.data.cleaned <- na.omit(PimaIndiansDiabetes2)

# Split the data into training and test set
set.seed(123)

t.sample <- pima.data.cleaned$diabetes %>% createDataPartition(p = 0.8, list = FALSE)

SVM linear classifier

In the following example variables are normalized to make their scale comparable. This is automatically done before building the SVM classifier by setting the option preProcess = c(“center”, “scale”).

# Fit the model on the training set
set.seed(123)

model.svm <- train(
  diabetes~., data = train.pima.indians, method = "svmLinear",
  trControl = trainControl("cv", number = 10),
  preProcess = c("center", "scale")
)

# Make predictions on the test data
predicted.classes.svm <- model.svm %>% predict(test.pima.indians)
head(predicted.classes.svm)
[1] neg pos neg pos pos neg
Levels: neg pos
# Computed model accuracy rate
mean(predicted.classes.svm == test.pima.indians$diabetes)
[1] 0.7820513

Note that, there is a tuning parameter C, also known as Cost, that determines the possible misclassifications. It essentially imposes a penalty to the model for making an error. The higher the value of C, the less likely it is that the SVM algorithm will misclassify a point.

By default caret builds the SVM linear classifier using C = 1. You can check this by typing `model.svm` in the R console.

model.svm
Support Vector Machines with Linear Kernel 

314 samples
  8 predictor
  2 classes: 'neg', 'pos' 

Pre-processing: centered (8), scaled (8) 
Resampling: Bootstrapped (25 reps) 
Summary of sample sizes: 314, 314, 314, 314, 314, 314, ... 
Resampling results:

  Accuracy   Kappa    
  0.7862095  0.4893315

Tuning parameter 'C' was held constant at a value of 1

The following R code compute SVM for a grid values of C and choose automatically the final model for predictions:

# Fit the model on the training set
set.seed(123)

model.svm2 <- train(
  diabetes ~., data = train.pima.indians, method = "svmLinear",
  trControl = trainControl("cv", number = 10),
  tuneGrid = expand.grid(C = seq(0.1, 2, length = 20)),
  preProcess = c("center", "scale")
)

# Plot model accuracy vs different values of Cost
plot(model.svm2)

# Print the best tuning parameter C that maximizes model accuracy
model.svm2$bestTune

Let’s use the best model

# Make predictions on the test data 
predicted.classess.svm2 <- model.svm2 %>% predict(test.pima.indians)

# Compute model accuracy rate
mean(predicted.classess.svm2 == test.pima.indians$diabetes)
[1] 0.7820513

SVM classifier using Non-Linear Kernel

To build a non-linear SVM classifier, we can use either polynomial kernel or radial kernel function.

  • Computing SVM using radial basis kernel
# Fit the model on the training set
set.seed(123)
model.svm.radial <- train(
  diabetes~., data = train.pima.indians, method = "svmRadial",
  trControl = trainControl("cv", number = 10),
  preProcess = c("center", "scale"),
  tuneLength = 10
)

# Print the best tuning parameter sigma and C that maximizes model accuracy

model.svm.radial$bestTune
# Make predictions on the test data
predicted.classess.svm.radial <- model.svm.radial %>% predict(test.pima.indians)
# Compute model Accuracy rate
mean(predicted.classess.svm.radial == test.pima.indians$diabetes)
[1] 0.7948718
  • Computing SVM using polynomial basis kernel:
# Fit the model on the training set
set.seed(123)
model.svm.polynomial <- train(
  diabetes~., data = train.pima.indians, method = "svmPoly",
  trControl = trainControl("cv", number = 10),
  preProcess = c("center", "scale"),
  tuneLength = 4
)

# Print the best tuning parameter sigma and C that maximizes model accuracy

model.svm.polynomial$bestTune
# Make predictions on the test data
predicted.classess.svm.polynomial <- model.svm.polynomial %>% predict(test.pima.indians)
# Compute model Accuracy rate
mean(predicted.classess.svm.polynomial == test.pima.indians$diabetes)
[1] 0.7948718

In our examples, it can be seen that the SVM classifier using non-linear kernel gives a better result compared to the linear model.

Classification Model Evaluation

After building a predictive classification model, you need to evaluate the performance of the model, that is, how good the model is at predicting the outcome of new observations (test data) that have not been used to train the model.

The common use metrics and methods for assessing the performance of predictive classification models:

  • Average classification accuracy - representing the proportion of correctly classified observations.

  • Confusion matrix - which is 2x2 table showing four parameters, including the number of true positives, true negatives, false negatives and false positives.

  • Precision, Recall and Specificity - which are three major performance metrics describing a predictive classification model

  • ROC curve - which is a graphical summary of the overall performance of the model, showing the proportion of true positives and false positives at all possible values of probability cutoff. The Area Under the Curve (AUC) summarizes the overall performance of the classifier.

Required R packages

  • tidyverse for easy data manipulation and visualization
  • caret for easy machine learning workflow
library(tidyverse)
library(caret)

Building a classification model

To keep things simple we’ll perform a binary classification, where the outcome variable can have only two possible values: negative vs positive.

We’ll compute an example of linear discriminant analysis model using the PimaIndiansDiabetes2 [mlbench package]

  1. Split the data into training (80%, used to build the model) and test set (20%, used to evaluate the model performance):
data("PimaIndiansDiabetes2", package = "mlbench")
pima.data.cleaned <- na.omit(PimaIndiansDiabetes2)
# Split the data into training and test set
set.seed(123)

t.sample <- pima.data.cleaned$diabetes %>% createDataPartition(p = 0.8, list = FALSE)

train.pima2 <- pima.data.cleaned[t.sample, ]
test.pima2 <- pima.data.cleaned[-t.sample, ]
  1. Fit the LDA model on the training set and make preditions on the test data:
library(MASS)
#Fit LDA

fit.lda <- lda(diabetes ~., data = train.pima2)

# Make predictions on the test data
predictions.pima2 <- predict(fit.lda, test.pima2)
preictions.probabilities2 <- predictions.pima2$posterior[,2]
predicted.classess2 <- predictions.pima2$class
observed.classes2 <- test.pima2$diabetes

Overall classification accuracy

The overall classification accuracy rate corresponds to the proportion of observations that have been correctly classified. Determining the raw classification accuracy is the first step in assessing the performance of a model.

accuracy <- mean(observed.classes2 == predicted.classess2)
accuracy
[1] 0.8076923
error <- mean(observed.classes2 != predicted.classess2)
error
[1] 0.1923077

From the output above, the linear discriminant analysis correctly predicted the individual outcome in 81% of the cases. This is by far better than random guessing. The misclassification error rate can be calculated as 100 - 81 = 19%.

In our example, a binary classifier can make two types of errors:

  • it can incorrectly assign an individual who is diabetes-positive to the diabetes-negative category
  • it can incorrectly assign an individual who is diabetes-negative to the diabetes-positive category.

The proportion of these two types of errors can be determined by creating a confusion matrix, which compares the predicted outcome values against the known outcome values.

Confusion matrix

The R function table() can be used to produce a confusion matrix in order to determine how many observations were correctly or incorrectly classified. It compares the observed and the predicted outcome values and shows the number of correct and incorrect predictions categorized by type of outcome.

# Confusion matrix, number of cases
table(observed.classes2, predicted.classess2)
                 predicted.classess2
observed.classes2 neg pos
              neg  48   4
              pos  11  15
# Confusion matrix, proportion of cases
table(observed.classes2, predicted.classess2) %>% prop.table() %>% round(digits = 3)
                 predicted.classess2
observed.classes2   neg   pos
              neg 0.615 0.051
              pos 0.141 0.192

The diagonal elements of the confusion matrix indicate correct predictions, while the off diagonals represent incorrect prediction. So, the correct classification rate is the sum of the number on the diagonal divided by the sample size in the test data. In our example, that is (48+15)/78 = 81%

  • True positives (d): these are cases in which we predicted the individuals would be diabetes-positive and they were .
  • True negatives (a): We predicted diabetes-negative, and the individuals were diabetes-negative
  • False positives (b): We predicted diabetes-positive, but the individuals didn’t actually have diabetes. (Also known as a Type I error.)
  • False negatives (c): We predicted diabetes-negative, but they did have diabetes. (Also known as a Type II error.)

Technically the raw prediction accuracy of the model is defined as (TruePositives + TrueNegatives)/SampleSize.

Precision, Recall and Specificity

In addition to the raw classification accuracy, there are many other metrics that are widely used to examine the performance of a classification model, including:

Precision, which is the proportion of true positives among all the individuals that have been predicted to be diabetes-positive by the model. This represents the accuracy of a predicted positive outcome. Precision = TruePositives/(TruePositives + FalsePositives).

Sensitivity (or Recall), which is the True Positive Rate (TPR) or the proportion of identified positives among the diabetes-positive population (class = 1). Sensitivity = TruePositives/(TruePositives + FalseNegatives).

Specificity, which measures the True Negative Rate (TNR), that is the proportion of identified negatives among the diabetes-negative population (class = 0). Specificity = TrueNegatives/(TrueNegatives + FalsePositives).

False Positive Rate (FPR), which represents the proportion of identified positives among the healthy individuals (i.e. diabetes-negative). This can be seen as a false alarm. The FPR can also be calculated as 1 - specificity. When positives are rare, the FPR can be high, leading to the situation where a predicted positive is most likely a negative.

confusionMatrix(observed.classes2, predicted.classess2, positive = "pos")
Confusion Matrix and Statistics

          Reference
Prediction neg pos
       neg  48   4
       pos  11  15
                                          
               Accuracy : 0.8077          
                 95% CI : (0.7027, 0.8882)
    No Information Rate : 0.7564          
    P-Value [Acc > NIR] : 0.1787          
                                          
                  Kappa : 0.5361          
                                          
 Mcnemar's Test P-Value : 0.1213          
                                          
            Sensitivity : 0.7895          
            Specificity : 0.8136          
         Pos Pred Value : 0.5769          
         Neg Pred Value : 0.9231          
             Prevalence : 0.2436          
         Detection Rate : 0.1923          
   Detection Prevalence : 0.3333          
      Balanced Accuracy : 0.8015          
                                          
       'Positive' Class : pos             
                                          

In our example, the sensitivity is ~58%, that is the proportion of diabetes-positive individuals that were correctly identified by the model as diabetes-positive. The specificity of the model is ~92%, that is the proportion of diabetes-negative individuals that were correctly identified by the model as diabetes-negative. The model precision, or positive predicted value, is 79%. (Note: `confusionMatrix()` expects the predicted classes as its first argument and the observed classes as `reference`; because the arguments are swapped in the call above, the printed Sensitivity/Specificity/PPV labels are interchanged relative to the values quoted here.)

ROC curve

The ROC curve (or receiver operating characteristics curve) is a popular graphical measure for assessing the performance or the accuracy of a classifier, summarizing the trade-off between the true positive rate and the false positive rate across all possible probability cutoffs.

Computing and plotting ROC curve

The ROC analysis can be easily performed using the R package pROC

library(pROC)
res.roc <- roc(observed.classes2,preictions.probabilities2)
Setting levels: control = neg, case = pos
Setting direction: controls < cases
plot.roc(res.roc, print.auc = TRUE)

# Extract some interesting results
roc.data <- data_frame(
  thresholds = res.roc$thresholds,
  sensitivity = res.roc$sensitivities,
  specificity = res.roc$specificities
)
`data_frame()` is deprecated, use `tibble()`.
This warning is displayed once per session.
# Get the probability threshold for specificity = 0.6

roc.data %>% filter(specificity >= 0.6)
plot.roc(res.roc, print.auc = TRUE, print.thres = "best")

plot.roc(res.roc, print.thres = c(0.3,0.5,0.7))

---
title: "ML Essentials"
output: html_notebook
date: Nov 28, 2020
---

# output:
#   md_document:
#     variant: markdown_github

```{r}
# NOTE(review): hard-coded setwd() makes this notebook non-portable;
# prefer knitting from the project directory or using relative paths.
setwd("~/HomeWork/Practice")
```


```{r}
# Install the datarium package (marketing data) from GitHub.
# Skipped when the installed SHA is unchanged; use force = TRUE to reinstall.
devtools::install_github("kassambara/datarium")
```

```{r}
# Load and preview the marketing data (youtube/facebook/newspaper ad
# budgets and the resulting sales).
data("marketing", package="datarium")
head(marketing)
```


```{r}
# Load and preview the built-in swiss data (fertility plus socio-economic
# indicators for 47 Swiss provinces).
data("swiss")
head(swiss)
```

```{r}
# Load and preview the Boston housing data from MASS (median home value
# `medv` and predictors such as `lstat`).
data("Boston", package="MASS")
head(Boston)
```

### Chapter 3 Regression

Loading Required R packages
```{r}
# tidyverse: data manipulation and plotting; caret: ML workflow helpers
# (data splitting, training, resampling).
library(tidyverse)
library(caret)
theme_set(theme_bw())
```
Preparing the data

```{r}
# Load the data
data("marketing", package = "datarium")
# Inspect the data: show 3 randomly sampled rows
sample_n(marketing,3)
```

Split the data into training and test set
```{r}
# Reproducible 80/20 split, stratified on the outcome (sales)
set.seed(123)
training.samples <- marketing$sales %>% createDataPartition(p=0.8, list = FALSE)

train.data <- marketing[training.samples,]
test.data <- marketing[-training.samples,]
```
```{r}
# Sanity-check the split: summary statistics and dimensions of each partition
summary(train.data)
dim(train.data)

dim(test.data)
```

Computing linear regression in r

```{r}
# Multiple linear regression of sales on all other columns (youtube,
# facebook, newspaper), fit on the training partition only
model <- lm(sales ~., data = train.data)
summary(model)

# Make predictions on the held-out test set
predictions <- model %>% predict(test.data)

# Model Performance
# (a) Prediction error, RMSE
RMSE(predictions, test.data$sales)
# (b) R-square
R2(predictions, test.data$sales)

```

## Non-Linear Regression

```{r}
# Reload Boston and fix the seed ahead of the train/test split below
data("Boston", package = "MASS")
set.seed(123)

head(Boston)
```

```{r}
# 80/20 split of Boston, stratified on the outcome medv.
# Bug fix: the partition is created as `training1.samples`, but the
# subsetting previously used `training.samples` (the marketing split),
# so the Boston train/test sets were built from the wrong row indices.
training1.samples <- Boston$medv %>% createDataPartition(p=0.8, list = FALSE)

train1.data <- Boston[training1.samples, ]
test1.data <- Boston[-training1.samples, ]

# Scatter plot of medv vs lstat with a smoothed trend — the relationship
# is clearly non-linear, motivating the models that follow
ggplot(train1.data, aes(lstat, medv)) +
  geom_point() +
  stat_smooth()
```

Let's start with fitting a linear regression model
```{r}
# Baseline: simple linear model medv ~ lstat, evaluated on the test set
model.lm <- lm(medv~lstat, data = train1.data)
summary(model.lm)
prediction <- model.lm %>% predict(test1.data)

data.frame(RMSE = RMSE(prediction, test1.data$medv),
           R2 = R2(prediction, test1.data$medv))
```

Let's visualize the data
```{r}
# Overlay the fitted straight line on the training data
ggplot(train1.data, aes(lstat, medv)) +
  geom_point() +
  stat_smooth(method = lm, formula = y ~ x)
```


### Polynomial regression



medv = b0 + b1*lstat + b2*lstat^2


```{r}
# Quadratic fit using a raw second-order term via I(lstat*lstat)
mod2 <- lm(medv ~ lstat + I(lstat*lstat), data = train1.data)
summary(mod2)
prediction2 <- mod2 %>% predict(test1.data)
data.frame(
  RMSE = RMSE(prediction2, test1.data$medv),
  R2 = R2(prediction2, test1.data$medv)
)
# Same degree-2 model using orthogonal polynomials — identical fitted
# values/predictions, but coefficients are not comparable to the raw form
mod22 <- lm(medv ~ poly(lstat,2), data = train1.data)
summary(mod22)
prediction22 <- mod22 %>% predict(test1.data)
data.frame(
  RMSE = RMSE(prediction22, test1.data$medv),
  R2 = R2(prediction22, test1.data$medv)
)
```

```{r}
# Visualize the degree-2 polynomial fit
ggplot(train1.data, aes(lstat, medv)) +
  geom_point() +
  stat_smooth(method = lm, formula = y ~ poly(x,2))
```

```{r}
# Same quadratic fit written with a raw (non-orthogonal) squared term
ggplot(train1.data, aes(lstat, medv)) +
  geom_point() +
  stat_smooth(method = lm, formula = y ~ (x + I(x*x)))
```


```{r}
# Degree-6 polynomial: check which higher-order terms are significant
mod6 <- lm(medv ~ poly(lstat, 6), data = train1.data)
summary(mod6)
```

As we can see in above model summary the polynomial term beyond 5 is not significant

Let's visualize model

```{r}
# Visualize the degree-6 fit
ggplot(train1.data, aes(lstat, medv)) +
  geom_point() +
  stat_smooth(method = lm, formula = y ~ poly(x,6))
```


```{r}
# Degree-5 polynomial — the highest degree with significant terms above
mod5 <- lm(medv ~ poly(lstat, 5), data = train1.data)
summary(mod5)
```


```{r}
# Visualize the degree-5 fit
ggplot(train1.data, aes(lstat, medv)) +
  geom_point() +
  stat_smooth(method = lm, formula = y ~ poly(x,5))
```



### Using log transformation

```{r}
# Log-transform the predictor: medv ~ log(lstat)
mod.log <- lm(medv ~ log(lstat), data = train1.data)
summary(mod.log)
prediction.log <- mod.log %>% predict(test1.data)

data.frame(
  RMSE = RMSE(prediction.log, test1.data$medv),
  R2 = R2(prediction.log, test1.data$medv)
)
```

```{r}
# Visualize the log fit
ggplot(data = train1.data, aes(lstat, medv))+
  geom_point() +
  stat_smooth(method = lm, formula = y~log(x))
```

### Spline Regression

```{r}
library(splines)
knots <- quantile(train1.data$lstat, p = c(0.25, 0.5, 0.75))
model.spline <- lm(medv ~bs(lstat,knots=knots), data = train1.data)
summary(model.spline)
prediction.spline <- model.spline %>% predict(test1.data)

data.frame(
  RMSE = RMSE(prediction.spline, test1.data$medv),
  R2 = R2(prediction.spline, test1.data$medv)
)
```



```{r}
ggplot(train1.data, aes(lstat, medv)) + 
  geom_point() + 
  stat_smooth(method = lm, formula = y~bs(x,df=3))
```


### Generalized additive models

```{r}
library(mgcv)

# Generalized additive model with a smooth term on lstat;
# the amount of smoothing is selected automatically by mgcv
model.additive <- gam(medv ~ s(lstat), data = train1.data)
summary(model.additive)

prediction.additive <- model.additive %>% predict(test1.data)

data.frame(
  RMSE = RMSE(prediction.additive, test1.data$medv),
  R2 = R2(prediction.additive, test1.data$medv)
)
```


```{r}
# Visualize the GAM smooth
ggplot(train1.data, aes(lstat, medv)) +
  geom_point() +
  stat_smooth(method = gam, formula = y ~ s(x))
```


## Regression Model Accuracy Metrics

### Model performance metrics

In regression model, the most commonly known evaluation metrics include:

1. __R-squared (R2)__, which is the proportion of variation in the outcome that is explained by the predictor variables. In multiple regression models, R2 corresponds to the squared correlation between the observed outcome values and the predicted values by the model.
The higher the R-squared, the better the model.

2. __Root Mean Squared Error (RMSE)__, which measures the average error performed by the model in the predicting the outcome for an observation. Mathematically, the RMSE is the square root of the mean squared error (MSE), which is the average squared difference between the observed actual outcome values and the values predicted by the model. So, MSE = mean((observeds - predicteds)^2) and RMSE = sqrt(MSE). The lower the RMSE, the better the model.

3. __Residual Standard Error (RSE)__, also known as the _model sigma_, is a variant of the RMSE adjusted for the number of predictors in the model. The lower the RSE, the better the model. In practice, the difference between RMSE and RSE is very small, particularly for large multivariate data.

4. __Mean Absolute Error (MAE)__, like the RMSE, the MAE measures the prediction error. Mathematically, it is the average absolute difference between observed and predicted outcomes, MAE = mean(abs(observeds - predicteds)). MAE is less sensitive to outliers compared to RMSE.

The problem with the above metrics, is that they are sensible to the inclusion of additional variables in the model, even if those variables don't have significant contribution in explaining the outcome. Put in other words, including additional variables in the model will always increase the R2 and reduce the RMSE. So, we need a more robust metric to guide the model choice.

Concerning R2, there is an adjusted version, called Adjusted R-squared, which adjusts the R2 for having too many variables in the model.

Additionally, there are four other important metrics - AIC, AICc, BIC and Mallows Cp - that are commonly used for model evaluation and selection. These are an unbiased estimate of the model prediction error MSE. The lower these metrics, the better the model.


1. __AIC__ stands for Akaike's Information Criteria, a metric developed by the Japanese statistician Hirotugu Akaike in 1970. The basic idea of AIC is to penalize the inclusion of additional variables to a model. It adds a penalty that increases the error when including additional terms. The lower the AIC, the better the model.


2. __AICc__ is a version of AIC corrected for small sample sizes.

3. __BIC__ (or Bayesian information criteria) is a variant of AIC with a strong penalty for including additional variables to the model.

4. __Mallows Cp__: A variant of AIC developed by Colin Mallows.

Generally, most commonly used metrics, for measuring regression model quality and comparing models, are: Adjusted R2, AIC, BIC and Cp


In the following sections, we'll show you how to compute the above-mentioned metrics.


```{r}
# modelr: rsquare()/rmse()/mae() helpers; broom: glance() model summaries
library(modelr)
library(broom)
```

```{r}
# Load the data
data("swiss")

# Inspect the data
sample_n(swiss,3)
```


### Building regression models

We start by creating two models:

1. Model 1, including all predictors
2. Model 2, including all predictors except the variable _Examination_


```{r}
# Model 1: Fertility regressed on all predictors
model.swiss.1 <- lm(Fertility~., data = swiss)
summary(model.swiss.1)
```

```{r}
# In-sample fit metrics for model 1 (computed on the same data used to
# fit the model, so R2/RMSE/MAE are optimistic)

data.frame(
  R2 = rsquare(model.swiss.1, data = swiss),
  RMSE = rmse(model.swiss.1, data = swiss),
  MAE = mae(model.swiss.1, data = swiss),
  AIC = AIC(model.swiss.1),
  BIC = BIC(model.swiss.1)
)
```


```{r}
# Model 2: all predictors except Examination
model.swiss.2 <- lm(Fertility~. - Examination, data = swiss)

summary(model.swiss.2)
```


```{r}
# In-sample fit metrics for model 2, for comparison with model 1
data.frame(
  R2 = rsquare(model.swiss.2, data = swiss),
  RMSE = rmse(model.swiss.2, data = swiss),
  MAE = mae(model.swiss.2, data = swiss),
  AIC = AIC(model.swiss.2),
  BIC = BIC(model.swiss.2)
)
```

```{r}
# One-row summaries (R2, adj. R2, sigma, AIC, BIC, ...) for both models
glance(model.swiss.1)
glance(model.swiss.2)
```


Manual computation of R2, RMSE and MAE

```{r}
# Manual computation of R2, MSE, RMSE and MAE from model-1 predictions
swiss %>% add_predictions(model.swiss.1) %>% summarise(
  R2 = cor(Fertility, pred)^2,
  # Bug fix: square the residuals BEFORE averaging.
  # The previous mean(Fertility - pred)^2 squared the MEAN residual,
  # which is ~0 for an OLS fit, so MSE/RMSE collapsed to ~0.
  MSE = mean((Fertility - pred)^2),
  RMSE = sqrt(MSE),
  MAE = mean(abs(Fertility - pred))
)
```


* __Comparing regression models performance__

```{r}
# Model 1: key comparison metrics
glance(model.swiss.1) %>% select(adj.r.squared, sigma, AIC, BIC, p.value)
```


```{r}
# Model 2: same metrics for side-by-side comparison
glance(model.swiss.2) %>% select(adj.r.squared, sigma, AIC, BIC, p.value)
```

From the output above, it can be seen that:

1. The two models have exactly the same *Adjusted R2* (0.67), meaning that they are equivalent in explaining the outcome, here the fertility score. Additionally, they have the same amount of *residual standard error* (RSE or sigma = 7.17). However, model 2 is simpler than model 1 because it incorporates fewer variables. All things being equal, the simpler model is always preferred in statistics.

2. The AIC and the BIC of the model 2 are lower than those of the model1. In model comparison strategies, the model with the lowest AIC and BIC score is preferred.

3. Finally, the F-statistic p.value of model 2 is lower than that of model 1. This means that model 2 is statistically more significant than model 1, which is consistent with the above conclusion.


Note that, the RMSE and the RSE are measured in the same scale as the outcome variable.
Dividing the RSE by the average value of the outcome variable will give you the prediction error rate, which should be as small as possible:

```{r}
# Prediction error rate: RSE relative to the mean outcome (~10% here)
sigma(model.swiss.1)/mean(swiss$Fertility)
```

In our example the average prediction error rate is 10%

## Cross-validation

**Cross-validation** refers to a set of methods for measuring the performance of a given predictive model on new test data sets.

The basic idea, behind cross-validation techniques, consists of dividing the data into two sets:

1. The training set, used to train (i.e. build) the model;
2. and the testing set (or validation set), used to test (i.e. validate) the model by estimating the prediction error.

Cross-validation is also known as a _resampling method_ because it involves fitting the same statistical method multiple times using different subsets of the data.


1. The different cross-validation methods for assessing model performance. We cover the following approaches:

    * validation set approach (or data split)
    * Leave One Out Cross Validation
    * k-fold Cross Validation
    * Repeated k-fold Cross Validation
 

## Model performance metrics

After building a model, we are interested in determining the accuracy of this model on predicting the outcome for new unseen observations not used to build the model. Put in other words, we want to estimate the prediction error.

To do so, the basic strategy is to:

1. Build the model on a training data set
2. Apply the model on a new test data set to make predictions
3. Compute the prediction errors


### Cross-validation methods

Briefly, cross-validation algorithms can be summarized as follow:

1. Reserve a small sample of the data set
2. Build (or train) the model using the remaining part of the data set 
3. Test the effectiveness of the model on the reserved sample of the data set. If the model works well on the test data set, then it's good.

The following sections describe the different cross-validation techniques.


#### 1. The Validation set Approach

The validation set approach consists of randomly splitting the data into two sets: one set is used to train the model and the remaining other set is used to test the model.

The process works as follow:

1. Build (train) the model on the training data set
2. Apply the model to the test data set to predict the outcome of new unseen observations
3. Quantify the prediction error as the mean squared difference between the observed and the predicted outcome values.

The example below splits the swiss data set so that 80% is used for training a linear regression model and 20% is used to evaluate the model performance.

```{r}
# Split the data into training and test set
set.seed(123)

training.samples <- createDataPartition(swiss$Fertility, p = 0.8, list = FALSE)

train.swiss.data <- swiss[training.samples, ]
test.swiss.data <- swiss[-training.samples, ]

# Fit a linear regression of Fertility on all remaining columns
model.swiss.lm <- lm(Fertility ~ ., data = train.swiss.data)

# Score the held-out 20% and report R2, RMSE and MAE
predictions.swiss.lm <- predict(model.swiss.lm, test.swiss.data)

data.frame(
  R2 = R2(predictions.swiss.lm, test.swiss.data$Fertility),
  RMSE = RMSE(predictions.swiss.lm, test.swiss.data$Fertility),
  MAE = MAE(predictions.swiss.lm, test.swiss.data$Fertility)
)
```


When comparing two models, the one that produces the lowest test sample RMSE is the preferred model.

The RMSE and the MAE are measured in the same scale as the outcome variable. Dividing the RMSE by the average value of the outcome variable will give you the prediction error rate, which should be as small as possible:

```{r}
RMSE(predictions.swiss.lm, test.swiss.data$Fertility)/mean(test.swiss.data$Fertility)
```

Note that, the validation set method is only useful when you have a large data set that can be partitioned. A disadvantage is that we build a model on a fraction of the data set only, possibly leaving out some interesting information about data, leading to higher bias. Therefore, the test error rate can be highly variable, depending on which observations are included in the training set and which observation are included in the validation set.


#### 2. Leave one out cross validation - LOOCV

This method works as follows:
1. Leave out one data point and build the model on the rest of the data set
2. Test the model against the data point that is left out at step 1 and record the test error associated with the prediction
3. Repeat the process for all data points
4. Compute the overall prediction error by taking the average of all these test error estimates recorded at step 3.

Practical example in R using the caret package:


```{r}
# Define training control: leave-one-out cross-validation.
# LOOCV is deterministic (each observation is held out exactly once),
# so no seed is needed.
train.control <- trainControl(method = "LOOCV")

# Train the model

model.loocv <- train(Fertility ~., data = swiss, method = "lm", trControl = train.control)
# Summarize the results
print(model.loocv)
```


The advantage of the LOOCV method is that we make use all data points reducing potential bias.

However, the process is repeated as many times as there are data points, resulting in a higher execution time when n is extremely large.

Additionally, we test the model performance against one data point at each iteration. This might result in higher variation in the prediction error, if some data points are outliers. So, we need a good ratio of testing data points, a solution provided by the **k-fold cross-validation method**.

#### K-fold cross-validation

The k-fold cross validation method evaluates the model performance on different subset of the training data and then calculate the average prediction error rate. The algorithm is as follow:

1. Randomly split the data into k subsets (or k folds) (for example 5 subsets)
2. Reserve one subset and train the model on all other subsets
3. Test the model on the reserved subset and record the prediction error
4. Repeat this process until each of the k subsets has served as the test set.
5. Compute the average of the k recorded errors. This is called the cross-validation error, and it serves as the performance metric for the model.

K-fold cross-validation (CV) is a robust method for estimating the accuracy of a model.
The most obvious advantage of k-fold CV compared to LOOCV is computational. A less obvious but potentially more important advantage of k-fold CV is that it often gives more accurate estimates of the test error rate than does LOOCV.


> In practice, one typically performs k-fold cross-validation using k = 5 or k = 10, as these values have been shown empirically to yield test error rate estimates that suffer neither from excessively high bias nor from very high variance.

The following example uses 10-fold cross validation to estimate the prediction error. Make sure to set seed for reproducibility.


```{r}
# Define training control
# The seed makes the random assignment of observations to the 10 folds reproducible
set.seed(123)

train.control <- trainControl(method = "cv", number = 10)

# Train the model (re-uses the name `train.control` from the LOOCV section)
model.cv <- train(Fertility~., data = swiss, method = "lm", trControl = train.control)

# Summarize the results
print(model.cv)
```

#### 4. Repeated K-fold cross-validation

The process of splitting the data into k-folds can be repeated a number of times, this is called repeated k-fold cross validation.


### Bootstrap procedure

The bootstrap method is used to quantify the uncertainty with a given statistical estimator or with a predictive model.

It consists of randomly selecting a sample of n observations from the original data set. This subset, called bootstrap data set is then used to evaluate the model.

This procedure is repeated a large number of times and the standard error of the bootstrap estimate is then calculated. The results provide an indication of the variance of the model performance.

Note that, the sampling is performed with replacement, which means that the same observation can occur more than once in the boostrrap data set.

#### Evaluating a predictive model performance


```{r}
# Define training control: 100 bootstrap resamples
# NOTE(review): no set.seed() before this chunk, so the reported
# RMSE/R2 will vary from run to run — consider seeding for reproducibility

train.control.bootstrap <- trainControl(method = "boot", number = 100)

model.bootstrap <- train(Fertility ~., data = swiss, method = "lm", trControl = train.control.bootstrap)

# Summarize the results
print(model.bootstrap)
```


### Quantifying an estimator uncertainty and confidence intervals

The bootstrap approach can be used to quantify the uncertainty (or standard error) associated with any given statistical estimator.

For example, you might want to estimate the accuracy of the linear regression beta coefficients using bootstrap method.

The different steps are as follows:

1. Create a simple function, model_coef(), that takes the **swiss** data set as well as the indices for the observations, and returns the regression coefficients.
2. Apply the function model_coef() to the full data set of 47 observations in order to compute the coefficients

We start by creating a function that returns the regression model coefficients:


```{r}
# Return the coefficients of a linear model of Fertility on all other
# columns, fitted to the rows of `data` selected by `index`.
# The (data, index) signature is the statistic interface expected by boot().
model_coef <- function(data, index) {
  fit <- lm(Fertility ~ ., data = data, subset = index)
  coef(fit)
}

model_coef(swiss, 1:47)
```


```{r}
library(boot)
# Seed the RNG so the bootstrap resamples — and hence the reported
# standard errors — are reproducible across knits (the original chunk
# had no seed, so its numbers changed on every run).
set.seed(123)
# 500 bootstrap replicates of the coefficient vector returned by model_coef()
boot(swiss, model_coef, 500)
```

In the output above,

* **original** column corresponds to the regression coefficients. The associated standard errors are given in the column **std.error** .

* **t1** corresponds to the intercept, **t2** corresponds to **Agriculture**, and so on ..

For example, it can be observed that the standard error (SE) of the regression coefficient associated with **Agriculture** is 0.06.

Note that, the standard errors measure the variability/accuracy of the beta coefficients. It can be used to compute the confidence intervals of the coefficients.

For example, the 95% confidence interval for a given coefficient b is defined as **b+/-1.96\*SE(b)**,
where:

* The lower limit of b = b-1.96\*SE(b) = -0.172 - (1.96\*0.0680) = -0.306 (for the Agriculture variable)
* The upper limit of b = b+1.96\*SE(b) = -0.172 + (1.96\*0.0680) = -0.03743121 (for the Agriculture variable)

That is, there is approximately a 95% chance that the interval[-0.306,-0.036] will contain the true value of the coefficient.

Using the standard lm() function gives a slightly different standard errors, because the linear model make some assumptions about the data:


```{r}
# Coefficient table (estimates, SEs, t and p values) from a plain lm() fit,
# for comparison with the bootstrap standard errors above
coef(summary(lm(Fertility ~ ., data = swiss)))
```


The bootstrap approach does not rely on any of these assumptions made by the linear model, and so it is likely giving a more accurate estimate of the coefficients' standard errors than is the summary() function.


## Model Selection

## Best Subsets Regression

```{r}
library(tidyverse)  # data manipulation and piping
library(caret)      # resampling / model-training workflow
library(leaps)      # regsubsets() for best-subset selection
```


### Computing best subset regression

The R function regsubsets() [leaps package] can be used to identify different best models of different sizes.

You need to specify the option **nvmax**, which represents the maximum number of predictors to incorporate in the model. For example, if nvmax = 5, the function will return up to the best 5-variable model, that is, it returns the best 1-variable model, the best 2-variable model, ..., the best 5-variable model.

In our example, we have only 5 predictor variables in the data. So, we'll use nvmax = 5




```{r}
# Best subset selection: the best model of each size, from 1 to 5 predictors
models.regsubsets <- regsubsets(Fertility~., data= swiss, nvmax = 5)
summary(models.regsubsets)
```


### Model selection criteria: Adjusted R2, Cp and BIC

The **summary()** function returns some metrics - Adjusted R2, Cp and BIC allowing us to identify the best overall model, where best is defined as the model that maximize the adjusted R2 and minimize the prediction errors

The adjusted R2 represents the proportion of variation, in the outcome, that are explained by variation in predictors values. The higher the adjusted R2, the better the model.

The best model, according to each of these metrics, can be extracted as follow:


```{r}
res.sum <- summary(models.regsubsets)

# Model size picked by each criterion: adjusted R2 is maximized,
# Cp and BIC are minimized (the three criteria may disagree)
data.frame(
  Adj.R2 = which.max(res.sum$adjr2),
  CP = which.min(res.sum$cp),
  BIC = which.min(res.sum$bic)
)

```



There is no single correct solution to model selection, each of these criteria will lead to slightly different models.


Remember that,

>"all models are wrong, some models are useful"

So, we have different "best" models depending on which metrics we consider. We need additional strategies.

Note also that the adjusted R2, BIC and Cp are calculated on the training data that have been used to fit the model. This means that, the model selection, using these metrics, is possibly subject to overfitting and may not perform as well when applied to new data.

A more rigorous approach is to select a model based on the prediction error computed on new test data using k-fold cross-validation techniques.

### K-fold cross-validation 

Here, we'll follow the procedure below:

1. Extract the different model formulas from the models object
2. Train a linear model on the formula using k-fold cross-validation(with k=5) and compute the prediction error of each model

We start by defining two helper function:

1. get_model_formula(), allowing to access easily the formula of the models returned by the function **regsubsets()**.


```{r}
# id:     model size (row of the regsubsets summary) to extract
# object: a fitted regsubsets object (data: whatever was passed to regsubsets)
# Returns the formula "outcome ~ x1+x2+..." for the id-th best model.
get_model_formula <- function(id, object){
  # Logical row flagging which predictors the id-th model includes;
  # drop column 1, which is the intercept
  included <- summary(object)$which[id, -1]
  # Recover the outcome variable from the original regsubsets() call
  orig.form <- as.formula(object$call[[2]])
  outcome <- all.vars(orig.form)[1]
  # Collapse the selected predictor names into the right-hand side
  rhs <- paste(names(included)[included], collapse = "+")
  # Build model formula
  as.formula(paste(outcome, "~", rhs))
}
```


```{r}
# Formula of the best 3-predictor model
get_model_formula(3,models.regsubsets)

```


2. get_cv_error(), to get the cross-validation (CV) error for a given model:

```{r}
# Return the 5-fold cross-validated RMSE of a linear model fitted with
# `model.formula` on `data`. The fixed seed makes the fold assignment
# (and hence the error estimate) identical across calls.
get_cv_error <- function (model.formula, data){
  set.seed(1)
  ctrl <- trainControl(method = "cv", number = 5)
  fitted.cv <- train(model.formula, data = data, method = "lm", trControl = ctrl)
  fitted.cv$results$RMSE
}
```

Use the above defined method to compute the prediction error


```{r}
# Compute cross-validation error

model.ids <- 1:5
# For each model size: extract its formula, then compute its 5-fold CV RMSE
cv.errors <- map(model.ids, get_model_formula, models.regsubsets) %>%
  map(get_cv_error, data = swiss) %>%
  unlist()
cv.errors

```

```{r}
# Select the model size that minimizes the CV error
which.min(cv.errors)

```

It can be seen that the model with 4 variables is the best model. It has the lowest prediction error. The regression coefficients of this model can be extracted as follows:

```{r}
coef(models.regsubsets, 4)
```


## Stepwise Regression

1. **Forward selection**: Which starts with no predictors in the model, Iteratively adds the most contributive predictor, and stops when the improvement is no longer statistically significant.

2. **Backward selection (or backward elimination)**: which starts with all predictors in the model (full model), iteratively removes the least contributive predictors, and stops when you have a model where all predictors are statistically significant.

3. **Stepwise selection (or sequential replacement)**, which is a combination of forward and backward selections. You start with no predictors, then sequentially add the most contributive predictors (like forward selection). After adding each new variable, remove any variables that no longer provide an improvement in the model fit.

```{r}
library(tidyverse)  # data manipulation
library(caret)      # model-training workflow
library(leaps)      # subset-selection utilities
```


### Computing stepwise regression

```{r}
library(MASS)  # stepAIC(); NOTE(review): MASS::select masks dplyr::select after this point
# Fit the full model
full.model <- lm(Fertility ~., data = swiss)

# Stepwise regression model: both directions, AIC criterion;
# trace = FALSE suppresses the step-by-step log
step.model <- stepAIC(full.model, direction = "both", trace = FALSE)

summary(step.model)
```


## Penalized Regression: Ridge, Lasso and Elastic Net


### Shrinkage methods

### Ridge regression

Ridge regression shrinks the regression coefficients, so that varibles, with minor contribution to the outcome, have their coefficients close to zero.

The shrinkage of the coefficients is achieved by penalizing the regression model with a penalty term called **L2-norm**, which is the sum of the squared coefficients.

The amount of the penalty can be fine-tuned using a constant called lambda . Selecting a good value for lambda is critical

When lambda = 0, the penalty term has no effect, and ridge regression will produce the classical least squares coefficients. However, as lambda increases to infinity, the impact of the shrinkage penalty grows, and the ridge regression coefficients will get close to zero.

> Note that, in contrast to the ordinary least squares regression, ridge regression is highly affected by the scale of the predictors. Therefore, it is better to standardize (i.e., scale) the predictors before applying the ridge regression, so that all the predictors are on the same scale.

The standardization of a predictor x can be achieved using the formula x' = x/sd(x), where sd(x) is the standard deviation of x. The consequence of this is that all standardized predictors will have a standard deviation of one, allowing the final fit to not depend on the scale on which the predictors are measured.

One important advantage of ridge regression is that it still performs well, compared to the ordinary least squares method, in a situation where you have a large multivariate data set with the number of predictors (p) larger than the number of observations (n).

One disadvantage of ridge regression is that it will include all the predictors in the final model, unlike the stepwise regression methods, which will generally select models that involve a reduced set of variables.

Ridge regression shrinks the coefficients towards zero, but it will not set any of them exactly to zero. The lasso regression is an alternative that overcomes this drawback.


### Lasso regression

Lasso stands for **Least Absolute Shrinkage and Selection Operator**. It shrinks the regression coefficients toward zero by penalizing the regression model with a penalty term called **L1-norm**, which is the sum of the absolute coefficients.

In the case of lasso regression, the penalty has the effect of forcing some of the coefficient estimates, with a minor contribution to the model, to be exactly equal to zero. This means that lasso can also be seen as an alternative to the subset selection methods for performing variable selection in order to reduce the complexity of the model.

As in ridge regression, selecting a good value of lambda for the lasso is critical.

One obvious advantage of lasso regression over ridge regression is that it produces simpler and more interpretable models that incorporate only a reduced set of the predictors. However, neither ridge regression nor the lasso will universally dominate the other.

Generally, lasso might perform better in a situation where some of the predictors have large coefficients, and the remaining predictors have very small coefficients.

Ridge regression will perform better when the outcome is a function of many predictors, all with coefficients of roughly equal size.

Cross-validation methods can be used for identifying which of these two techniques is better on a particular data set.



### Elastic Net

Elastic Net produces a regression model that is penalized with both the **L1-norm** and **L2-norm**. The consequence of this is to effectively shrink coefficients (like in ridge regression) and to set some coefficients to zero (as in lasso).

#### Loading required R packages


```{r}
library(tidyverse)  # data manipulation
library(caret)      # train/test split helpers
library(glmnet)     # penalized regression (ridge / lasso / elastic net)

```


Preparing the data

We'll use the Boston data set [in MASS package], for predicting the median house value(mdev), in Boston Suburbs, based on multiple predictor variables.


```{r}
# Load the data
data("Boston", package = "MASS")

# Split the data into training and test set (stratified on the outcome)
set.seed(123)
boston.samples <- createDataPartition(Boston$medv, p = 0.8, list = FALSE)

train.boston.data <- Boston[boston.samples, ]
test.boston.data <- Boston[-boston.samples, ]
```


#### Computing penalized linear regression

You need to create two objects:

* y for storing the outcome variable
* x for holding the predictor variables.

```{r}
# Predictor matrix: model.matrix() expands any factors into dummy columns;
# [,-1] drops the intercept column
x <- model.matrix(medv~., train.boston.data)[,-1]
# Outcome variable
y <- train.boston.data$medv
```

We'll use the R function glmnet() for computing penalized linear regression models.


```{r}
glmnet(x,y, alpha = 1, lambda = NULL)
```

* x: matrix of predictor variables
* y: the response or outcome variable, which is a binary variable.
* alpha: the elasticnet mixing parameter. Allowed values include:
   - "1": for lasso regression
   - "0": for ridge regression
   - a value between 0 and 1 (say 0.25) for elastic net regression.
* lambda: a numeric value defining the amount of shrinkage. It should be specified by the analyst.


In penalized regression, you need to specify a constant lambda to adjust the amount of the coefficent shrinkage. The best lambda for your data, can be defined as the lambda that minimize the cross-validation prediction error rate. This can be determined automatically using the function cv.glmnet().

>In the following section, we start by computing ridge, lasso and elastic net regression models. Next, we'll compare the different models in order to choose the best one for our data.
The best model is defined as the model that has the lowest prediction error (RMSE).

Computing ridge regression

```{r}
# Find the best lambda using cross-validation (alpha = 0 -> ridge penalty)
set.seed(123)
cv <- cv.glmnet(x,y, alpha = 0)

# Display the best lambda value
cv$lambda.min
```


```{r}
# Fit the final ridge model on the training data, at the CV-chosen lambda
model.ridge <- glmnet(x,y, alpha = 0, lambda = cv$lambda.min)

# Display regression coefficients
coef(model.ridge)

```



```{r}
# Make predictions on the test data
# (same design-matrix encoding as used for training)
x.test <- model.matrix(medv~., test.boston.data)[,-1]
predictions.ridge <- model.ridge %>% predict(x.test) %>% as.vector()

# Model performance metrics

data.frame(
  RMSE = RMSE(predictions.ridge, test.boston.data$medv),
  Rsquare = R2(predictions.ridge, test.boston.data$medv)
)
```

> Note that by default, the function glmnet() standardizes variables so that their scales are comparable. However, the coefficients are always returned on the original scale.

Computing lasso regression

The only difference from the R code used for ridge regression is that, for lasso regression, you need to specify the argument alpha = 1 instead of alpha = 0.


```{r}
# Find the best lambda using cross-validation (alpha = 1 -> lasso penalty);
# note this overwrites the ridge `cv` object from the previous section
set.seed(123)
cv <- cv.glmnet(x,y,alpha = 1)
# Display the best lambda value
cv$lambda.min
```

```{r}
# Fit the final lasso model on the training data
# (the generic name `model` shadows earlier objects also called `model`)
model <- glmnet(x,y, alpha = 1, lambda = cv$lambda.min)
#Display regression coefficients
coef(model)
```


```{r}
# Make predictions on the test data
x.laso.test <- model.matrix(medv ~., test.boston.data)[,-1]
# Fix: predict on the lasso test matrix built on the line above, not on the
# ridge section's leftover `x.test` (they happen to hold the same values
# here, but relying on an object from another section is fragile and left
# `x.laso.test` unused).
predictions.lasso <- model %>% predict(x.laso.test) %>% as.vector()

# Model performance metrics
data.frame(
  RMSE = RMSE(predictions.lasso, test.boston.data$medv),
  Rsquare = R2(predictions.lasso, test.boston.data$medv)
)
```

### Computing elastic net regression

The elastic net regression can be easily computed using the caret workflow, which invokes the glmnet package.

We use caret to automatically select the best tuning parameters alpha and lambda. The caret package tests a range of possible alpha and lambda values, then selects the best values for lambda and alpha, resulting in a final model that is an elastic net model.

Here, we'll test the combination of 10 different values for alpha and lambda. This is specified using the option tuneLength.

The best alpha and lambda values are those values that minimize the cross-validation error


```{r}
# Elastic net via caret — left commented out (slow to run when knitting).
# NOTE(review): the original argument was `trcontrol =`; caret's train()
# expects `trControl =`, so the CV control would be silently ignored.
# # Build the model using the training set
# set.seed(123)
# model.elastic <- train(
#   medv ~., data = train.boston.data, method = "glmnet",
#   trControl = trainControl("cv", number = 5),
#   tuneLength = 10
# )
# 
# # Best tuning parameter
# model.elastic$bestTune
```


## Principal Component and Partial Least Squares Regression


### Principal component regression

The principal component regression(PCR) first applies Principal Component Analysis on the data set to summarize the original predictor variables into few new variables also known as principal component (PCs), which are a linear combination of the original data.

These PCs are then used to build the linear regression model. The number of principal components, to incorporate in the model, is chosen by cross-validation (CV). Note that PCR is suitable when the data set contains highly correlated predictors.

### Partial least squares regression

A possible drawback of PCR is that we have no guarantee that the selected principal components are associated with the outcome. Here, the selection of the principal components to incorporate in the model is not supervised by the outcome variable.

An alternative to PCR is the **Partial Least Squares** (PLS) regression, which identifies new principal components that not only summarize the original predictors, but also are related to the outcome. These components are then used to fit the regression model. So, compared to PCR, PLS uses a dimension reduction strategy that is supervised by the outcome.

Like PCR, PLS is convenient for data with highly-correlated predictors. The number of PCs used in PLS is generally chosen by cross-validation. Predictors and the outcome variables should be generally standardized, to make the variables comparable.


### Loading required R packages

* tidyverse for easy data manipulation and visualization
* caret for easy machine learning workflow
* pls, for computing PCR and PLS

```{r}
library(tidyverse)  # data manipulation
library(caret)      # model-training workflow
library(pls)        # principal component / partial least squares regression
```


### Preparing the data

We'll use the boston data set

#### Computing principal component regression


```{r}
# Build the PCR model on the training set

set.seed(123)

model.pc <- train(
  medv~., data = train.boston.data, method = "pcr",
  scale = TRUE,                                  # standardize predictors first
  trControl = trainControl("cv", number = 10),   # 10-fold CV to pick ncomp
  tuneLength = 10                                # try 1..10 components
)

# Plot model RMSE vs different values of components

plot(model.pc)

# Print the best tuning parameter ncomp that
# minimizes the cross-validation error, RMSE
model.pc$bestTune
```



```{r}
summary(model.pc$finalModel)
```

```{r}
# Make predictions on the held-out test set
predictions.pc <- model.pc %>% predict(test.boston.data)
# Model performance metrics (explicit caret:: guards against name masking)

data.frame(
  RMSE = caret::RMSE(predictions.pc, test.boston.data$medv),
  Rsquare = caret::R2(predictions.pc, test.boston.data$medv)
)
```

The plot shows the prediction error made by the model according to the number of principal components incorporated in the model.

Our analysis shows that, choosing five principal components (ncomp = 5) gives the smallest prediction error RMSE.

The **summary()** function also provides the percentage of variance explained in the predictors(x) and in the outcome (medv) using different numbers of components.

For example, 80.94% of the variation (or information) contained in the predictors is captured by 5 components (ncomp = 5). Additionally, setting ncomp = 5 captures 71% of the information in the outcome variable (medv), which is good.

> Taken together, cross-validation identifies ncomp = 5 as the optimal number of PCs that minimize the prediction error (RMSE) and explain enough variation in the predictors and in the outcome.



### Computing partial least squares


```{r}
# Build the PLS model on the training set
set.seed(123)
model.pls <- train(
  medv~., data = train.boston.data, method = "pls",
  scale = TRUE,                                  # standardize predictors first
  trControl = trainControl("cv", number = 10),   # 10-fold CV to pick ncomp
  tuneLength = 10                                # try 1..10 components
)

# Plot model RMSE vs Different values of components
plot(model.pls)
```


```{r}
model.pls$bestTune
```

```{r}
summary(model.pls$finalModel)
```

```{r}
# Make predictions on the held-out test set
predictions.pls <- model.pls %>% predict(test.boston.data)

# Model performance metrics

data.frame(
  RMSE = caret::RMSE(predictions.pls, test.boston.data$medv),
  Rsquare = caret::R2(predictions.pls, test.boston.data$medv)
)
```


The optimal number of principal components included in the PLS model is 9. This captures 90% of the variation in the predictors and 75% of the variation in the outcome variable (medv).

In our example, the cross-validation error RMSE obtained with the PLS model is lower than the RMSE obtained using the PCR method. So, the PLS model is the best model, for explaining our data, compared to the PCR model.





# Classification

In this part, we'll cover the following topics:

* Logistic regression, for binary classification tasks
* Stepwise and penalized logistic regression for variable selection
* Logistic regression assumptions and diagnostics
* Multinomial logistic regression, an extension of the logistic regression for multiclass classification tasks
* Discriminant analysis, for binary and multiclass classification problems
* Naive Bayes classifier
* Support vector machines
* Classification model evaluation

Most of the classification algorithms compute the probability of belonging to a given class.

Observations are then assigned to the class that has the highest probability score.

Generally, you need to decide a probability cutoff above which you consider an observation as belonging to a given class.


Data sets we will be using:

1. PimaIndiansDiabetes2 data set

The Pima Indian Diabetes data set is available in the mlbench package. It will be used for binary classification.

```{r}
# Load the data set (768 female patients; binary `diabetes` outcome)
data("PimaIndiansDiabetes2", package = "mlbench")

# Inspect the data
head(PimaIndiansDiabetes2,4)
```

```{r}
str(PimaIndiansDiabetes2)
```

The data contains 768 individuals (female) and 9 clinical variables for predicting the probability of individuals being diabetes-positive or negative:

| Column Names | Description |
| ------------ | ----------- |
| pregnant | 	Number of times pregnant |
| glucose | 	Plasma glucose concentration (glucose tolerance test) |
| pressure | 	Diastolic blood pressure (mm Hg) |
| triceps | 	Triceps skin fold thickness (mm) |
| insulin | 	2-Hour serum insulin (mu U/ml) |
| mass | 	Body mass index (weight in kg/(height in m)\^2) |
| pedigree | 	Diabetes pedigree function |
| age | 	Age (years) |
| diabetes | 	Class variable (test for diabetes) |


Performing the following steps might improve the accuracy of your model

* Remove potential outliers
* Make sure that the predictor variables are normally distributed. If not, you can use log, root, Box-Cox transformation.
* Remove highly correlated predictors to minimize overfitting. The presence of highly correlated predictors might lead to an unstable model solution

```{r}
set.seed(123)

# Stratified 80/20 split on the outcome, so both sets keep the
# original pos/neg class balance
training.Pima.samples <- createDataPartition(
  PimaIndiansDiabetes2$diabetes,
  p = 0.8, list = FALSE
)

train.pima.data <- PimaIndiansDiabetes2[training.Pima.samples, ]
test.pima.data <- PimaIndiansDiabetes2[-training.Pima.samples, ]
```


### Computing logistic regression

```{r}
# Fit the model: logistic regression of diabetes status on all predictors
# (rows with NA predictors are dropped by glm's default na.action)
model.lr.pima <- glm(diabetes~., data = train.pima.data, family = binomial)

# Summarize the model
summary(model.lr.pima)

## Make predictions (type = "response" returns P(diabetes == "pos"))

probabilities <- model.lr.pima %>% predict(test.pima.data, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")

# Model accuracy.
# Fix: PimaIndiansDiabetes2 contains missing values, so test rows with NA
# predictors get NA predictions; without na.rm = TRUE the whole mean — the
# reported accuracy — evaluates to NA. Accuracy is therefore computed over
# complete test rows only.
mean(predicted.classes == test.pima.data$diabetes, na.rm = TRUE)
```



#### Simple logistic regression

The simple logistic regression is used to predict the probability of class membership based on one single predictor variable.

The following R code builds a model to predict the probability of being diabetes-positive based on the plasma glucose concentration:

```{r}
# Simple (single-predictor) logistic regression: diabetes ~ glucose
model.logit.simple <- glm(diabetes ~ glucose, data = train.pima.data, family = binomial)
summary(model.logit.simple)
```

The output above shows the estimate of the regression beta coefficients and their significance levels. The intercept (b0) is -5.55 and the coefficient of glucose variable is 0.039.

The logistic equation can be written as p = exp(-5.55 + 0.039\*glucose)/[1+exp(-5.55+0.039\*glucose)]. Using this formula, for each new glucose plasma concentration value, you can predict the probability of the individual being diabetes-positive.

Predictions can be easily made using the function predict(). Use the option type = "response" to directly obtain the probabilities

```{r}
# Predicted probability of being diabetes-positive at two glucose values
newdata <- data.frame(glucose = c(20,190))
probabilities <- model.logit.simple  %>% predict(newdata, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")
predicted.classes
```

```{r}
# Visualize the fitted logistic curve: observed 0/1 outcomes as
# semi-transparent points, overlaid with the glm-smoothed probability
train.pima.data %>%
  mutate(prob = ifelse(diabetes == "pos", 1, 0)) %>%
  ggplot(aes(glucose,prob)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "glm", method.args = list(family = "binomial"))+
  labs(
    title = "Simple Logistic Regression Model",
    x = "Plasma Glucose Concentration",
    y = "Probability of being diabetes-pos"
  )
```



```{r}
plot(model.logit.simple)
```




# Stepwise Logistic Regression

**Stepwise logistic regression** consists of automatically selecting a reduced number of predictor variables for building the best performing logistic regression model.


Data set: **PimaIndiansDiabetes2**


Quick start R code

```{r}
library(MASS)
# Fit the model
# Drop incomplete rows first so stepAIC compares models on a constant data set
removed.missing.data <- na.omit(train.pima.data)
model <- glm(diabetes ~ ., data = removed.missing.data , family = binomial) %>% stepAIC(trace = FALSE)

summary(model)

# Apply the same NA handling to the test set
removed.missing.data.from.test <- na.omit(test.pima.data)

probabilities.step <- model %>% predict(removed.missing.data.from.test, type = "response")
predicted.classes.step.logit <- ifelse(probabilities.step > 0.5, "pos", "neg")

#Model Accuracy (computed over complete test cases only)
mean(predicted.classes.step.logit == removed.missing.data.from.test$diabetes)
```

```{r}
# Element-wise check of predictions against observed classes.
# Fixed: predictions were made on the NA-free test set, so they must be
# compared against removed.missing.data.from.test (same rows, same order).
# Comparing against the full test.pima.data mismatches lengths and
# misaligns rows wherever NA rows were dropped.
df <- data.frame("Predicted" = predicted.classes.step.logit)
(df$Predicted == removed.missing.data.from.test$diabetes)
```



## Penalized logistic regression

**Penalized logistic regression** imposes a penalty on the logistic model for having too many variables. This results in shrinking the coefficients of the less contributive variables toward zero. This is also known as **regularization**.
The most commonly used penalized regression include

* **Ridge regression**: variables with minor contribution have their coefficients close to zero.
However, all the variables are incorporated in the model. This is useful when all variables need to be incorporated in the model according to domain knowledge.

* **lasso regression**: the coefficients of some less contributive variables are forced to be exactly zero. Only the most significant variables are kept in the final model.

* **elastic net regression**: the combination of ridge and lasso regression. It shrinks some coefficients toward zero (like ridge regression) and sets some coefficients to exactly zero (like lasso regression).


Required packages

```{r}
# Packages for data wrangling (tidyverse), ML workflow (caret),
# and penalized regression (glmnet)
library(tidyverse)
library(caret)
library(glmnet)
```

Preparing the data

Data set: PimaIndiansDiabetes2

```{r}
# Load the data and remove NAs
Pima.Indians.data <- na.omit(PimaIndiansDiabetes2)
# Inspect the data
sample_n(Pima.Indians.data, 3)

# Split the data into training and test set (80/20, stratified on outcome)
set.seed(123)

training.samples <- Pima.Indians.data$diabetes %>%
  createDataPartition(p = 0.8, list = FALSE)

train.pima.indians <- Pima.Indians.data[training.samples, ]
test.pima.indians <- Pima.Indians.data[-training.samples, ]
```

Additional data preparation

The R function model.matrix() helps create the matrix of predictors and also automatically converts categorical predictors to appropriate dummy variables, which is required for the glmnet() function.

```{r}
# Build the predictor matrix (dummy-coding factors, dropping the intercept
# column) as required by glmnet()
x <- model.matrix(diabetes ~ ., train.pima.indians)[, -1]

# Encode the outcome as 0/1 ("pos" -> 1, "neg" -> 0)
y <- as.numeric(train.pima.indians$diabetes == "pos")
```



We'll use the R function glmnet() for computing penalized logistic regression.

The simplified format is as follow:


```{r}
library(glmnet)
# Illustrative call: alpha = 1 is the lasso penalty; lambda = NULL lets
# glmnet compute a whole regularization path (the fit object is printed)
glmnet(x, y, family = "binomial", alpha = 1, lambda = NULL)
```


```{r}
set.seed(123)
# Cross-validate the lasso penalty (alpha = 1 -> lasso)
cv.lasso <- cv.glmnet(x, y, alpha = 1, family = "binomial")
# Fit the final model on the training data at the CV-optimal lambda
model.logit.lasso <- glmnet(x, y, alpha = 1, family = "binomial", lambda = cv.lasso$lambda.min)

# Display regression coefficients
coef(model.logit.lasso)

# Make predictions on the test data
x.test <- model.matrix(diabetes ~ ., test.pima.indians)[, -1]
# Fixed: type = "response" is required to get probabilities. The glmnet
# default (type = "link") returns log-odds, for which a 0.5 cutoff is wrong
# (0.5 on the link scale corresponds to p ~ 0.62, not p = 0.5).
probabilities.logit.lasso <- model.logit.lasso %>% predict(newx = x.test, type = "response")
predict.classess.logit.lasso <- ifelse(probabilities.logit.lasso > 0.5, "pos", "neg")

# Model accuracy
observed.classes <- test.pima.indians$diabetes
mean(predict.classess.logit.lasso == observed.classes)
```

```{r}
# Cross-validation curve: binomial deviance vs log(lambda)
plot(cv.lasso)
```


The plot displays the cross-validation error according to the log of lambda. The left dashed vertical line indicates that the log of the optimal value of lambda is approximately -5, which is the one that minimizes the prediction error. This lambda value will give the most accurate model. The exact value of lambda can be viewed as follow:

```{r}
# lambda.min minimizes CV error; lambda.1se is the largest lambda within
# one standard error of the minimum (simpler, more regularized model)
cv.lasso$lambda.min
cv.lasso$lambda.1se
```

Using lambda.min as the best lambda, gives the following regression coefficients:
```{r}
# Coefficients at lambda.min (fewer coefficients shrunk to exactly zero)
coef(cv.lasso, cv.lasso$lambda.min)
```

```{r}
# Coefficients at lambda.1se (sparser model)
coef(cv.lasso, cv.lasso$lambda.1se)
```



### Logistic Regression Assumptions and Diagnostics


The logistic regression method assumes that:

* The outcome is a binary or dichotomous variable like yes vs no, positive vs negative, 1 vs 0.
* There is a linear relationship between the logit of the outcome and each predictor variable. Recall that the logit function is logit(p) = log(p/(1-p)), where p is the probability of the outcome.

* There are no influential values (extreme values or outliers) in the continuous predictors.
* There is no high intercorrelation (i.e. multicollinearity) among the predictors.

Loading required R packages

```{r}
# broom provides augment() used below for residual diagnostics
library(tidyverse)
library(broom)
theme_set(theme_classic())
```

```{r}
PimaIndiansDiabetes2.cleaned <- na.omit(PimaIndiansDiabetes2)
# Fit the logistic regression model on all predictors
model.pima.logit2 <- glm(diabetes~., data = PimaIndiansDiabetes2.cleaned, family = binomial)
# Predict the probability (p) of diabetes positivity
probabilities.pima.logit <- predict(model.pima.logit2, type = "response")
predicted.pima.classes <- ifelse(probabilities.pima.logit > 0.5, "pos", "neg")
head(predicted.pima.classes)
```


```{r}
# Keep only the numeric predictors for the linearity-of-the-logit check
mydata <- PimaIndiansDiabetes2.cleaned %>%
  dplyr::select_if(is.numeric)
predictors <- colnames(mydata)
# Bind the logit of the fitted probabilities and reshape to long format
# for faceted plotting. pivot_longer() replaces the superseded gather().
model.pima.logit <- mydata %>%
  mutate(logit = log(probabilities.pima.logit/(1-probabilities.pima.logit))) %>%
  pivot_longer(cols = -logit, names_to = "predictors", values_to = "predictor.value")
```
```{r}
# One scatter + loess panel per predictor: each should look roughly linear
# in the logit if the linearity assumption holds
ggplot(model.pima.logit, aes(logit, predictor.value)) +
  geom_point(size = 0.5, alpha = 0.5) +
  geom_smooth(method = "loess") +
  theme_bw() +
  facet_wrap(~predictors, scales = "free_y")
```

### Influential values

Influential values are extreme individual data points that can alter the quality of the logistic regression model.
The most extreme values in the data can be examined by visualizing the Cook's distance values.

Here we label the top 3 largest values:

```{r}
# which = 4 selects the Cook's distance plot; id.n = 3 labels the top 3
plot(model.pima.logit2, which = 4, id.n = 3)
```

Note that not all outliers are influential observations. To check whether the data contain potential influential observations, the standardized residual error can be inspected. Data points with an absolute standardized residual above 3 represent possible outliers and may deserve closer attention.

The following R code computes the standardized residuals (.std.resid) and the Cook's distance (.cooksd) using the R function augment()[broom package].


```{r}
# Extract per-observation model results (.std.resid, .cooksd, ...) and add
# a row index for plotting

model.pima.logit.data <- augment(model.pima.logit2) %>%
  mutate(index = 1:n())
```

The data for the top 3 largest values, according to the Cook's distance, can be displayed as follow:

```{r}
# Rows with the 3 largest Cook's distances.
# slice_max() replaces the superseded top_n().
model.pima.logit.data %>% slice_max(.cooksd, n = 3)

```

Plot the standardized residuals:

```{r}
# Standardized residuals by row index, colored by outcome class
ggplot(model.pima.logit.data, aes(index, .std.resid)) +
  geom_point(aes(color = diabetes), alpha = 0.5) +
  theme_bw()
```


Filter potential influential data points with abs(.std.res) > 3

```{r}
# Potential outliers: |standardized residual| > 3 (none expected here)
model.pima.logit.data %>% filter(abs(.std.resid) > 3)
```
> There is no influential observation in our data.

When we have outliers in a continuous predictor, potential solutions include:

* Removing the concerned records
* Transform the data into log scale
* Use non parameteric methods

### Multicollinearity

Multicollinearity corresponds to a situation where the data contain highly correlated predictor variables.

Multicollinearity is an important issue in regression analysis and should be fixed by removing the concerned variables. It can be assessed using the R function vif()

```{r}
# Variance inflation factors; values > 5-10 suggest multicollinearity
car::vif(model.pima.logit2)
```

As a rule of thumb, a VIF value that exceeds 5 or 10 indicates a problematic amount of collinearity. In our example there is no collinearity: all variables have a value of VIF well below 5.


## Multinomial logistic regression

**Multinomial logistic regression** is an extension of logistic regression for multiclass classification tasks. It is used when the outcome involves more than two classes.

Loading required R packages
```{r}
# nnet provides multinom() for multinomial logistic regression
library(tidyverse)
library(caret)
library(nnet)
```

### Preparing the data

We'll use the iris data set

```{r}
# Load and preview the iris data
data("iris")
sample_n(iris, 3)

# Reproducible 80/20 split, stratified on the Species outcome
set.seed(123)
training.samples.iris <- createDataPartition(iris$Species, p = 0.8, list = FALSE)
train.iris.data <- iris[training.samples.iris, ]
test.iris.data <- iris[-training.samples.iris, ]
```


### Computing multinomial logistic regression
```{r}
# Fit the multinomial model (three Species classes)
model <- nnet::multinom(Species ~., data = train.iris.data)
# Summarize the model
summary(model)

# Make predictions on the held-out test set

predicted.species <- model %>% predict(test.iris.data)
head(predicted.species)
# Model accuracy (proportion of correctly classified test observations)
mean(predicted.species == test.iris.data$Species)
```

## Discriminant Analysis

The following discriminant analysis methods will be described:

* Linear discriminant analysis (LDA): Uses linear combinations of predictors to predict the class of a given observation. Assumes that the predictor variables (p) are normally distributed and the classes have identical variances (for univariate analysis, p = 1) or identical covariance matrices (for multivariate analysis, p > 1).

* Quadratic discriminant analysis (QDA): More flexible than LDA. Here, there is no assumption that the covariance matrix of classes is the same

* Mixture discriminant analysis (MDA): Each class is assumed to be a Gaussian mixture of subclasses.

* Flexible Discriminant Analysis (FDA): Non-linear combination of predictors is used such as splines.

* Regularized discriminant analysis (RDA): Regularization (or shrinkage) improves the estimate of the covariance matrices in situation where the number of predictors is larger than the number of samples in the training data. This leads to an improvement of the discriminant analysis.

### Loading required R packages

```{r}
# Packages for the discriminant-analysis examples
library(tidyverse)
library(caret)
theme_set(theme_classic())
```


### Preparing the data
1. Split the data

```{r}
# Load the data
data("iris")
# Split the data into training and test set (stratified 80/20)
set.seed(123)
training.samples.iris <- iris$Species %>%
  createDataPartition(p=0.8, list = FALSE)

train.data.iris <- iris[training.samples.iris, ]
test.data.iris <- iris[-training.samples.iris, ]
```

2. Normalize the data. Categorical 

```{r}
# Estimate centering/scaling parameters from the training set only,
# to avoid leaking test-set information into the preprocessing
preproc.param <- preProcess(train.data.iris, method = c("center", "scale"))

# Apply the same transformation to both training and test sets
train.transformed <- predict(preproc.param, train.data.iris)
test.transformed <- predict(preproc.param, test.data.iris)
```


Before performing LDA, consider:
* Inspecting the univariate distribution of each variable and making sure that they are normally distributed. If not, you can transform them using log and root transforms for exponential distributions and Box-Cox for skewed distributions.

* Removing outliers from your data and standardizing the variables to make their scales comparable.

## R code
```{r}
library(MASS)

# Fit the LDA model on the standardized training data
model.lda <- lda(Species ~., data = train.transformed)
# Make predictions; predict() for lda returns a list with $class, $posterior, $x
predictions <- model.lda %>% predict(test.transformed)

# Model accuracy
mean(predictions$class == test.transformed$Species)
```

```{r}
# Print priors, group means, and discriminant coefficients
model.lda
```

```{r}
summary(model.lda)
```


LDA determines group means and computes, for each individual, the probability of belonging to the different groups. The individual is then affected to the group with the highest probability score.

The **lda()** outputs contain the following elements:
* **prior probabilities of groups**: the proportion of training observation in each group. For example, there are 33% of the training observations in the setosa group
* **Group means**: group center of gravity. Shows the mean of each variable in each group.
* **Coefficients of linear discriminants**: Shows the linear combination of predictor variables that are used to form the LDA decision rule. For example:

LD1 = 0.01\*Sepal.Length + 0.64\*Sepal.Width - 4.08\*Petal.Length - 2.3\*Petal.Width
LD2 = 0.03\*Sepal.Length + 0.89\*Sepal.Width - 2.2\*Petal.Length - 2.6\*Petal.Width

Using the function **plot()** produces plots of the linear discriminants, obtained by computing LD1 and LD2 for each of the training observations.

```{r}
# Histograms/scatter of LD1 and LD2 for the training observations
plot(model.lda)
```

```{r}
# Append the discriminant scores (LD1, LD2) and plot by class
lda.data <- cbind(train.transformed, predict(model.lda)$x)

ggplot(lda.data, aes(LD1, LD2)) +
  geom_point(aes(color = Species))
```


> It can be seen that, our model correctly classified 100% of observations, which is excellent.

Note that, by default, the probability cutoff used to decide group-membership is 0.5 

In some situation, you might want to increase the precision of the model. In this case you can fine-tune the model by adjusting the posterior probability cutoff. For example, you can increase or lower the cutoff

#### Variable selection:

Note that, if the predictor variables are standardized before computing LDA, the discriminator weights can be used as measures of variable importance for feature selection

### Quadratic discriminant analysis - QDA

QDA is a little more flexible than LDA, in the sense that it does not assume equality of variance/covariance. In other words, for QDA the covariance matrix can be different for each class.

LDA tends to perform better than QDA when you have a small training set.

In contrast, QDA is recommended if the training set is very large, so that the variance of the classifier is not a major issue, or if the assumption of a common covariance matrix for the K classes is clearly untenable.

QDA can be computed using the R function qda() [MASS package]

```{r}
library(MASS)
# Fit the QDA model (per-class covariance matrices)
model.qda <- qda(Species~., data = train.transformed)
model.qda

# Make predictions
predictions.qda <- model.qda %>% predict(test.transformed)

# Model accuracy
mean(predictions.qda$class == test.transformed$Species)
```


## Mixture discriminant analysis

The LDA classifier assumes that each class comes from a single normal (or Gaussian ) distribution.

This is too restrictive.

For MDA, there are K classes, and each class is assumed to be a Gaussian mixture of subclasses, where each data point has a probability of belonging to each class. Equality of the covariance matrix among classes is still assumed.

```{r}
library(mda)
# Mixture discriminant analysis: each class modeled as a Gaussian mixture
model.mda <- mda(Species~., data = train.transformed)
model.mda

# Make predictions (returns the predicted class labels directly)

predicted.classes.mda <- model.mda %>% predict(test.transformed)

# Model accuracy
mean(predicted.classes.mda == test.transformed$Species)
```


### Flexible discriminant analysis - FDA 

FDA is a flexible extension of LDA that uses non-linear combinations of predictors such as splines. FDA is useful for modeling multivariate non-normality or non-linear relationships among variables within each group, allowing for a more accurate classification.

```{r}
library(mda)
# Fit the flexible discriminant analysis model (non-linear combinations)
model.fda <- fda(Species~., data = train.transformed)
model.fda

predicted.classes.fda <- model.fda %>% predict(test.transformed)
# Model accuracy
mean(predicted.classes.fda == test.transformed$Species)
```


### Regularized discriminant analysis

RDA builds a classification rule by regularizing the group covariance matrices allowing a more robust model against multicollinearity in the data. This might be very useful for a large multivariate data set containing highly correlated predictors.

Regularized discriminant analysis is a kind of trade-off between LDA and QDA. Recall that, in LDA, we assume equality of the covariance matrix for all of the classes, while QDA assumes different covariance matrices for each class. Regularized discriminant analysis is an intermediate between LDA and QDA.

RDA shrinks the separate covariances of QDA toward a common covariance as in LDA. This improves the estimate of the covariance matrices in situations where the number of predictors is larger than the number of samples in the training data, potentially leading to an improvement of the model accuracy.

```{r}
library(klaR)
# Fit the regularized discriminant analysis model (trade-off between LDA/QDA)
model.rda <- rda(Species~., data = train.transformed)

# Make predictions; predict() for rda returns a list with $class
predictions.rda <- model.rda %>% predict(test.transformed)

# Model accuracy

mean(predictions.rda$class == test.transformed$Species)
```


## Naive Bayes Classifier

The **Naive Bayes Classifier** is a simple and powerful method that can be used for binary and multiclass classification problems.

Naive Bayes classifier predicts the class membership probability of observations using Bayes theorem, which is based on conditional probability, that is the probability of something to happen, given that something else has already occured.


Will use PimaIndiansDiabetes2 data set

#### Computing Naive Bayes

```{r}
library(klaR)
# Fit the Naive Bayes model on the Pima training set
model.naive <- NaiveBayes(diabetes ~., data = train.pima.indians)

model.naive

```

```{r}
# Make predictions; predict() for NaiveBayes returns a list with $class
prediction.naive <- model.naive %>% predict(test.pima.indians)

# Model accuracy
mean(prediction.naive$class == test.pima.indians$diabetes)
```

#### Using caret R package

The caret R package can automatically train the model and assess the model accuracy using k-fold cross-validation.

```{r}
library(klaR)

set.seed(123)

# Train Naive Bayes via caret with 10-fold cross-validation
model.naive2 <- train(diabetes~., data = train.pima.indians, method = "nb", trControl = trainControl("cv", number = 10))

# Make predictions
predicted.classes.naive2 <- model.naive2 %>% predict(test.pima.indians)

# Model accuracy

mean(predicted.classes.naive2 == test.pima.indians$diabetes)
```


### Support Vector Machine

**Support Vector Machine (or SVM)** is a machine learning technique used for classification tasks. Briefly, SVM works by identifying the optimal decision boundary that separates data points from different groups (or classes), and then predicts the class of new observations based on the separation boundary.

Depending on the situations, the different groups might be separable by a linear straight line or by a non-linear boundary line.

Support vector machine methods can handle both linear and non-linear class boundaries. They can be used for both two-class and multi-class classification problems.

In real-life data, the separation boundary is generally non-linear. Technically, the SVM algorithm performs a non-linear classification using what is called the kernel trick. The most commonly used kernel transformations are the _polynomial kernel_ and the _radial kernel_.

Note that, there is also an extension of the SVM for regression, called support vector regression.

#### Loading required R packages

```{r}
# Packages for the SVM examples (caret wraps kernlab's SVM implementations)
library(tidyverse)
library(caret)
```


#### Dataset

Data set: PimaIndiansDiabetes2 [in mlbench package]

```{r}
# Load the data
data("PimaIndiansDiabetes2", package = "mlbench")
pima.data.cleaned <- na.omit(PimaIndiansDiabetes2)

# Split the data into training and test set
set.seed(123)

# NOTE(review): t.sample is computed here but the actual train/test
# subsetting is not performed in this chunk -- the SVM chunks below reuse
# train.pima.indians/test.pima.indians from the earlier split. Confirm
# this is intentional.
t.sample <- pima.data.cleaned$diabetes %>% createDataPartition(p = 0.8, list = FALSE)
```

#### SVM linear classifier

In the following example variables are normalized to make their scale comparable. This is automatically done before building the SVM classifier by setting the option preProcess = c("center", "scale").

```{r}
# Fit a linear SVM on the training set with 10-fold cross-validation;
# predictors are centered and scaled before training.
set.seed(123)

# Fixed: the control argument was misspelled "trContro". It only worked
# through R's partial argument matching, which is fragile and flagged by
# linters; spell the argument out in full.
model.svm <- train(
  diabetes ~ ., data = train.pima.indians, method = "svmLinear",
  trControl = trainControl("cv", number = 10),
  preProcess = c("center", "scale")
)

# Make predictions on the test data
predicted.classes.svm <- model.svm %>% predict(test.pima.indians)
head(predicted.classes.svm)
```

```{r}
# Compute model accuracy rate on the test set
mean(predicted.classes.svm == test.pima.indians$diabetes)
```

Note that, there is a tuning parameter C, also known as _Cost_, that determines the possible misclassifications. It essentially imposes a penalty to the model for making an error. The higher the value of C, the less likely it is that the SVM algorithm will misclassify a point.

By default, **caret** builds the SVM linear classifier using C = 1. You can check this by typing the model name in the R console.

```{r}
# Print the fitted model; shows the default cost C = 1
model.svm
```

The following R code compute SVM for a grid values of C and choose automatically the final model for predictions:

```{r}
# Fit the model on the training set, tuning cost C over a grid of 20 values
set.seed(123)

model.svm2 <- train(
  diabetes ~., data = train.pima.indians, method = "svmLinear",
  trControl = trainControl("cv", number = 10),
  tuneGrid = expand.grid(C = seq(0.1, 2, length = 20)),
  preProcess = c("center", "scale")
)

# Plot model accuracy vs different values of Cost
plot(model.svm2)
```


```{r}
# Print the best tuning parameter C that maximizes CV accuracy
model.svm2$bestTune
```


Let's use the best model 

```{r}
# Make predictions on the test data with the tuned model
predicted.classess.svm2 <- model.svm2 %>% predict(test.pima.indians)

# Compute model accuracy rate
mean(predicted.classess.svm2 == test.pima.indians$diabetes)
```


### SVM classifier using Non-Linear Kernel

To build a non-linear SVM classifier, we can use either polynomial kernel or radial kernel function. 

* **Computing SVM using radial basis kernel**

```{r}
# Fit an RBF-kernel SVM; tuneLength = 10 tries 10 (sigma, C) combinations
set.seed(123)
model.svm.radial <- train(
  diabetes~., data = train.pima.indians, method = "svmRadial",
  trControl = trainControl("cv", number = 10),
  preProcess = c("center", "scale"),
  tuneLength = 10
)

# Print the best tuning parameters sigma and C that maximize accuracy

model.svm.radial$bestTune
```


```{r}
# Make predictions on the test data
predicted.classess.svm.radial <- model.svm.radial %>% predict(test.pima.indians)
# Compute model accuracy rate
mean(predicted.classess.svm.radial == test.pima.indians$diabetes)
```

* **Computing SVM using polynomial basis kernel**:

```{r}
# Fit a polynomial-kernel SVM; tuneLength = 4 limits the tuning grid
set.seed(123)
model.svm.polynomial <- train(
  diabetes~., data = train.pima.indians, method = "svmPoly",
  trControl = trainControl("cv", number = 10),
  preProcess = c("center", "scale"),
  tuneLength = 4
)

# Print the best tuning parameters (degree, scale, C)

model.svm.polynomial$bestTune
```




```{r}
# Make predictions on the test data
predicted.classess.svm.polynomial <- model.svm.polynomial %>% predict(test.pima.indians)
# Compute model accuracy rate
mean(predicted.classess.svm.polynomial == test.pima.indians$diabetes)
```


> In our examples, it can be seen that the SVM classifier using non-linear kernel gives a better result compared to the linear model.



## Classification Model Evaluation

After building a predictive classification model, you need to **evaluate the performance of the model**, that is how good the model is in predicting the outcome of new observations test data that have been not used to train the model.

The common use metrics and methods for assessing the performance of predictive classification models:

* **Average classification accuracy** - representing the proportion of correctly classified observations.

* **Confusion matrix** - which is 2x2 table showing four parameters, including the number of true positives, true negatives, false negatives and false positives.

* **Precision, Recall and Specificity** - which are three major performance metrics describing a predictive classification model

* **ROC curve** - which is a graphical summary of the overall performance of the model, showing the proportion of true positives and false positives at all possible values of the probability cutoff. The **Area Under the Curve (AUC)** summarizes the overall performance of the classifier.

#### Required R packages

* tidyverse for easy data manipulation and visualization
* caret for easy machine learning workflow

```{r}
# Packages for model evaluation (caret provides confusionMatrix())
library(tidyverse)
library(caret)
```

#### Building a classification model

To keep things simple we'll perform a binary classification, where the outcome variable can have only two possible values: negative vs positive.

We'll compute an example of linear discriminant analysis model using the **PimaIndiansDiabetes2** [mlbench package]

1. Split the data into training (80%, used to build the model) and test set (20%, used to evaluate the model performance):

```{r}
data("PimaIndiansDiabetes2", package = "mlbench")
pima.data.cleaned <- na.omit(PimaIndiansDiabetes2)
# Split the data into training and test set (stratified 80/20)
set.seed(123)

t.sample <- pima.data.cleaned$diabetes %>% createDataPartition(p = 0.8, list = FALSE)

train.pima2 <- pima.data.cleaned[t.sample, ]
test.pima2 <- pima.data.cleaned[-t.sample, ]
```

2. Fit the LDA model on the training set and make predictions on the test data:

```{r}
library(MASS)
# Fit LDA on the training set

fit.lda <- lda(diabetes ~., data = train.pima2)

# Make predictions on the test data
predictions.pima2 <- predict(fit.lda, test.pima2)
# Posterior probability of the second class ("pos"), used later for the ROC.
# NOTE(review): variable name has a typo ("preictions"); kept as-is because
# the ROC chunk below references it by this exact name.
preictions.probabilities2 <- predictions.pima2$posterior[,2]
predicted.classess2 <- predictions.pima2$class
observed.classes2 <- test.pima2$diabetes
```

#### Overall classification accuracy
The overall **classification accuracy** rate corresponds to the proportion of observations that have been correctly classified. Determining the raw classification accuracy is the first step in assessing the performance of a model.

```{r}
# Overall classification accuracy: proportion of correct predictions
accuracy <- mean(observed.classes2 == predicted.classess2)
accuracy
```

```{r}
# Misclassification error rate (= 1 - accuracy)
error <- mean(observed.classes2 != predicted.classess2)
error
```

> From the output above, the linear discriminant analysis correctly predicted the individual outcome in 81% of the cases. This is far better than random guessing. The misclassification error rate can be calculated as 100 - 81 = 19%.

In our example, a binary classifier can make two types of errors:

* it can incorrectly assign an individual who is diabetes-positive to the diabetes-negative category
* it can incorrectly assign an individual who is diabetes-negative to the diabetes-positive category.

The proportion of these two types of errors can be determined by creating a **confusion matrix**, which compares the predicted outcome values against the known outcome values.

### Confusion matrix

The R function table() can be used to produce a **confusion matrix** in order to determine how many observations were correctly or incorrectly classified. It compares the observed and the predicted outcome values and shows the number of correct and incorrect predictions categorized by type of outcome.

```{r}
# Confusion matrix, number of cases (rows = observed, cols = predicted)
table(observed.classes2, predicted.classess2)
```


```{r}
# Confusion matrix, proportion of cases
table(observed.classes2, predicted.classess2) %>% prop.table() %>% round(digits = 3)
```

> The diagonal elements of the confusion matrix indicate correct predictions, while the off diagonals represent incorrect prediction. So, the correct classification rate is the sum of the number on the diagonal divided by the sample size in the test data. In our example, that is (48+15)/78 = 81%


* **True positives** (d): these are cases in which we predicted the individuals would be diabetes-positive and they were .
* **True negatives** (a): We predicted diabetes-negative, and the individuals were diabetes-negative
* **False positives** (b): We predicted diabetes-positive, but the individuals didn't actually have diabetes. (Also known as a _Type I error._)
* **False negatives** (c): We predicted diabetes-negative, but they did have diabetes. (Also known as a _Type II error._)

>Technically the raw prediction accuracy of the model is defined as (TruePositives + TrueNegatives)/SampleSize.


### Precision, Recall and Specificity

In addition to the raw classification accuracy, there are many other metrics that are widely used to examine the performance of a classification model, including:

**Precision**, which is the proportion of true positives among all the individuals that have been predicted to be diabetes-positive by the model. This represents the accuracy of a predicted positive outcome. Precision = TruePositives/(TruePositives + FalsePositives).

**Sensitivity (or Recall)**, which is the **True Positive Rate** (TPR) or the proportion of identified positives among the diabetes-positive population (class = 1). Sensitivity = TruePositives/(TruePositives + FalseNegatives).

**Specificity**, which measures the **True Negative Rate** (TNR), that is the proportion of identified negatives among the diabetes-negative population (class = 0). Specificity = TrueNegatives/(TrueNegatives + FalsePositives).

**False Positive Rate** (FPR), which represents the proportion of identified positives among the healthy individuals (i.e. diabetes-negative). This can be seen as a false alarm. The FPR can also be calculated as 1 - specificity. When positives are rare, the FPR can be high, leading to the situation where a predicted positive is most likely a negative.

```{r}
# Fixed: caret::confusionMatrix(data, reference) expects the PREDICTED
# classes first and the observed classes second. The original call passed
# them swapped, which silently exchanges sensitivity with PPV and
# specificity with NPV in the printed statistics.
confusionMatrix(predicted.classess2, observed.classes2, positive = "pos")
```


> In our example, the sensitivity is ~58%, that is the proportion of diabetes-positive individuals that were correctly identified by the model as diabetes-positive.
The specificity of the model is ~92%, that is the proportion of diabetes-negative individuals that were correctly identified by the model as diabetes-negative.
The model precision or the proportion of positive predicted value is 79%.


### ROC curve

The **ROC curve (or receiver operating characteristic curve)** is a popular graphical measure for assessing the performance or the accuracy of a classifier, summarizing its behavior over all possible probability cutoffs.

### Computing and plotting ROC curve

The ROC analysis can be easily performed using the R package pROC

```{r}
library(pROC)
# roc(response, predictor): observed classes first, then the predicted
# positive-class probabilities
res.roc <- roc(observed.classes2,preictions.probabilities2)
plot.roc(res.roc, print.auc = TRUE)
```

```{r}
# Collect threshold/sensitivity/specificity triples from the ROC object.
# tibble() replaces the deprecated data_frame().
roc.data <- tibble(
  thresholds = res.roc$thresholds,
  sensitivity = res.roc$sensitivities,
  specificity = res.roc$specificities
)

# Get the probability thresholds achieving specificity >= 0.6

roc.data %>% filter(specificity >= 0.6)
```




```{r}
# Mark the "best" threshold (maximizes sensitivity + specificity)
plot.roc(res.roc, print.auc = TRUE, print.thres = "best")
```

```{r}
# Show sensitivity/specificity at three specific probability cutoffs
plot.roc(res.roc, print.thres = c(0.3,0.5,0.7))
```




